SENDAs Agreement 1 Update 2010-2022 (step 2)

First step of deduplication process. Exploratory data analysis was conducted, addressing issues such as data entry errors, missing values, and the conversion of the date of birth into the age at the time of the first discharge for each individual.

Author

Andrés González Santa Cruz

Published

June 6, 2025


Data Loading and Exploration

Loading Packages and uniting databases

Proceed to load the necessary packages.

Code
invisible("Only run from Ubuntu")
if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
  if(Sys.info()["sysname"]!="Windows"){
    Sys.setenv(RETICULATE_PYTHON = "/home/fondecytacc/.pyenv/versions/3.11.5/bin/python")
  }
}

#clean enviroment
rm(list = ls()); gc()
file.path(paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))),"data/20241015_out"))

wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
wdpath

envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath

time_before_dedup2<-Sys.time()

#base::load(paste0(wdpath,"data/20241015_out/","3_ndp_2025_05_30.Rdata"))
if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
  file.path(paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))),"data/20241015_out"))
  
  wdpath<-
  paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
  wdpath
  
  envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
  envpath 
  
  base::load(paste0(wdpath,"data/20241015_out/","3_ndp_2025_06_02.Rdata"))

  } else {
    file.path(paste0(getwd(),"/_input"))
    paste0(getwd(),"/_input","/3_ndp_2025_06_02.Rdata")
    base::load(paste0(getwd(),"/_input","/3_ndp_2025_06_02.Rdata.enc"))
  }

time_before_dedup1<-Sys.time()
password <- Sys.getenv("PASS_PPIO")
system(sprintf("7z x path/to/_input/3_ndp_2025_06_02.Rdata.7z.001 -p'%s'", password))
          used (Mb) gc trigger (Mb) max used (Mb)
Ncells  605956 32.4    1279675 68.4  1086641 58.1
Vcells 1221570  9.4    8388608 64.0  2106365 16.1
[1] "G:/My Drive/Alvacast/SISTRAT 2023//data/20241015_out"
[1] "G:/My Drive/Alvacast/SISTRAT 2023//"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
[1] 127
Code
#https://github.com/rstudio/renv/issues/544
#renv falls back to copying rather than symlinking, which is evidently very slow in this configuration.
renv::settings$use.cache(FALSE)

#only use explicit dependencies (in DESCRIPTION)
renv::settings$snapshot.type("implicit")

#check if rstools is installed
try(installr::install.Rtools(check_r_update=F))

Installing package into ‘G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32’ (as ‘lib’ is unspecified)

Code
if(quarto::quarto_version()<"1.7.29"){
stop("You need to install a recent quarto version")   # la publicada el 28-abr-2025
}

#change repository to CL
local({
  r <- getOption("repos")
  r["CRAN"] <- "https://cran.dcc.uchile.cl/"
  options(repos=r)
})

if(!require(pacman)){install.packages("pacman");require(pacman)}

Cargando paquete requerido: pacman

Code
if(!require(pak)){install.packages("pak");require(pak)}

Cargando paquete requerido: pak

Code
pacman::p_unlock(lib.loc = .libPaths()) #para no tener problemas reinstalando paquetes

No 00LOCK detected in: G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32 No 00LOCK detected in: C:/Program Files/R/R-4.4.1/library

Code
if(Sys.info()["sysname"]=="Windows"){
if (getRversion() != "4.4.1") { stop("Requires R version 4.4.1; Actual: ", getRversion()) }
}

#check docker
check_docker_running <- function() {
  # Try running 'docker info' to check if Docker is running
  system("docker info", intern = TRUE, ignore.stderr = TRUE)
}

install_docker <- function() {
  # Open the Docker Desktop download page in the browser for installation
  browseURL("https://www.docker.com/products/docker-desktop")
}

# Main logic
if (inherits(try(check_docker_running(), silent = TRUE), "try-error")) {
  liftr::install_docker()
} else {
  message("Docker is running.")
}

Warning in system(“docker info”, intern = TRUE, ignore.stderr = TRUE): el comando ejecutado ‘docker info’ tiene el estatus 1

Docker is running.

Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#PACKAGES#######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

unlink("*_cache", recursive=T)

# ----------------------------------------------------------------------
# 2. Use a single pak::pkg_install() call for most CRAN packages
# ----------------------------------------------------------------------

paks <-
  c(#"git", 
    # To connect to github
    "gh", #interface for  GitHub API from R
    #
    "gitcreds", # manages Git credentials (usernames, passwords, tokens)
    #
    "usethis", # simplifies common project setup tasks for R developers
    # Package to bring packages in development
    "devtools",
    # Package administration
    "renv",
    # To manipulate data
    "knitr", "pander", "DT",
    # Join
    "fuzzyjoin", "RecordLinkage",
    # For tables
    "tidyverse", "janitor",
    # For contingency tables
    "kableExtra",
    # For connections with python
    "reticulate",
    # To manipulate big data
    "polars", "sqldf",
    # To bring big databases
    "nanoparquet",
    # Interface for R and RStudio in R
    "installr", "rmarkdown", "quarto", "yaml", #"rstudioapi",
    # Time handling
    "clock",
    # Combine plots
    "ggpubr",
    # Parallelized iterative processing
    "furrr",
    # Work like a tibble with a data.table database
    "tidytable",
    # Split database into training and testing
    "caret",
    # Impute missing data
    "missRanger", "mice",
    # To modularize tasks
    "job",
    # For PhantomJS install checks
    "webshot"
  )

# dplyr
# janitor
# reshape2
# tidytable
# arrow
# boot
# broom
# car
# caret
# data.table
# DiagrammeR
# DiagrammeRsvg
# dplyr
# epiR
# epitools
# ggplot2
# glue
# htmlwidgets
# knitr
# lubridate
# naniar
# parallel
# polycor
# pROC
# psych
# readr
# rio
# rsvg
# scales
# stringr
# tableone
# rmarkdown
# biostat3
# codebook
# finalfit
# Hmisc
# kableExtra
# knitr
# devtools
# tidyr
# stringi
# stringr
# muhaz
# sqldf
# compareGroups
# survminer
# lubridate
# ggfortify
# car
# fuzzyjoin
# compareGroups
# caret
# job
# htmltools
# nanoparquet
# ggpubr
# polars
# installr
# clock
# pander
# reshape
# mice
# missRanger
# VIM
# withr
# biostat3
# broom
# glue
# finalfit
# purrr
# sf


# pak::pkg_install(paks)

pak::pak_sitrep()
# pak::sysreqs_check_installed(unique(unlist(paks)))
#pak::lockfile_create(unique(unlist(paks)),  "dependencies_duplicates24.lock", dependencies=T)
#pak::lockfile_install("dependencies_duplicates24.lock")
#https://rdrr.io/cran/pak/man/faq.html
#pak::cache_delete()

library(tidytable)

Adjuntando el paquete: ‘tidytable’

The following objects are masked from ‘package:stats’:

dt, filter, lag

The following object is masked from ‘package:base’:

%in%
Code
library(polars)

Warning: package ‘polars’ was built under R version 4.4.3

Code
library(ggplot2)
library(readr)

# ----------------------------------------------------------------------
# 3. Activate polars code completion (safe to try even if it fails)
# ----------------------------------------------------------------------
try(polars_code_completion_activate())

Using code completion in ‘native’ mode.

Code
# ----------------------------------------------------------------------
# 4. BPMN from GitHub (not on CRAN, so install via devtools if missing)
# ----------------------------------------------------------------------
if (!requireNamespace("bpmn", quietly = TRUE)) {
  devtools::install_github("bergant/bpmn")
}

# ----------------------------------------------------------------------
# 5. PhantomJS Check (use webshot if PhantomJS is missing)
# ----------------------------------------------------------------------
# if (!webshot::is_phantomjs_installed()) {
#   webshot::install_phantomjs()
# }

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#FUNCTIONS######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

copiar_nombres <- function(x,row.names=FALSE,col.names=TRUE,dec=",",...) {
  if(class(try(dplyr::ungroup(x)))[1]=="tbl_df"){
    if(options()$OutDec=="."){
      options(OutDec = dec)
      write.table(format(data.frame(x)),"clipboard",sep="\t",row.names=FALSE,col.names=col.names,...)
      options(OutDec = ".")
      return(x)
    } else {
      options(OutDec = ",")
      write.table(format(data.frame(x)),"clipboard",sep="\t",row.names=FALSE,col.names=col.names,...)
      options(OutDec = ",")
      return(x)    
    }
  } else {
    if(options()$OutDec=="."){
      options(OutDec = dec)
      write.table(format(x),"clipboard",sep="\t",row.names=FALSE,col.names=col.names,...)
      options(OutDec = ".")
      return(x)
    } else {
      options(OutDec = ",")
      write.table(format(x),"clipboard",sep="\t",row.names=FALSE,col.names=col.names,...)
      options(OutDec = ",")
      return(x)       
    }
  }
}  

#WINDOWS do not restrict memory size
if(.Platform$OS.type == "windows") withAutoprint({
  memory.size()
  memory.size(TRUE)
  memory.limit()
})

Warning: ‘memory.size()’ is no longer supported

Warning: ‘memory.size()’ is no longer supported

Warning: ‘memory.limit()’ is no longer supported

Code
memory.limit(size=56000)

Warning: ‘memory.limit()’ is no longer supported

Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#NAs are replaced with "" in knitr kable
options(knitr.kable.NA = '')

pander::panderOptions('big.mark', ',')
pander::panderOptions('decimal.mark', '.')

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#
#to format rows in bold
format_cells <- function(df, rows ,cols, value = c("italics", "bold", "strikethrough")){

  # select the correct markup
  # one * for italics, two ** for bold
  map <- setNames(c("*", "**", "~~"), c("italics", "bold", "strikethrough"))
  markup <- map[value]  

  for (r in rows){
    for(c in cols){

      # Make sure values are not factors
      df[[c]] <- as.character( df[[c]])

      # Update formatting
      df[r, c] <- ifelse(nchar(df[r, c])==0,"",paste0(markup, gsub(" ", "", df[r, c]), markup))
    }
  }

  return(df)
}
#To produce line breaks in messages and warnings
knitr::knit_hooks$set(
   error = function(x, options) {
     paste('\n\n<div class="alert alert-danger" style="font-size: small !important;">',
           gsub('##', '\n', gsub('^##\ Error', '**Error**', x)),
           '</div>', sep = '\n')
   },
   warning = function(x, options) {
     paste('\n\n<div class="alert alert-warning" style="font-size: small !important;">',
           gsub('##', '\n', gsub('^##\ Warning:', '**Warning**', x)),
           '</div>', sep = '\n')
   },
   message = function(x, options) {
     paste('<div class="message" style="font-size: small !important;">',
           gsub('##', '\n', x),
           '</div>', sep = '\n')
   }
)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

sum_dates <- function(x){
  
  cbind.data.frame(
min= as.Date(min(unclass(as.Date(x)), na.rm=T), origin = "1970-01-01"),
p001= as.Date(quantile(unclass(as.Date(x)), .001, na.rm=T), origin = "1970-01-01"),
p005= as.Date(quantile(unclass(as.Date(x)), .005, na.rm=T), origin = "1970-01-01"),
p025= as.Date(quantile(unclass(as.Date(x)), .025, na.rm=T), origin = "1970-01-01"),
p25= as.Date(quantile(unclass(as.Date(x)), .25, na.rm=T), origin = "1970-01-01"),
p50= as.Date(quantile(unclass(as.Date(x)), .5, na.rm=T), origin = "1970-01-01"),
p75= as.Date(quantile(unclass(as.Date(x)), .75, na.rm=T), origin = "1970-01-01"),
p975= as.Date(quantile(unclass(as.Date(x)), .975, na.rm=T), origin = "1970-01-01"),
p995= as.Date(quantile(unclass(as.Date(x)), .995, na.rm=T), origin = "1970-01-01"),
p999= as.Date(quantile(unclass(as.Date(x)), .999, na.rm=T), origin = "1970-01-01"),
max= as.Date(max(unclass(as.Date(x)), na.rm=T), origin = "1970-01-01")
  )
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# Define the function adapted for Polars
sum_dates_polars <- function(df, date_col) {
  # Create the list of quantiles
  quantiles <- c(0.001, 0.005, 0.025, 0.25, 0.5, 0.75, 0.975, 0.995, 0.999)
  # Create expressions to calculate min and max
  expr_list <- list(
    pl$col(date_col)$min()$alias("min"),
    pl$col(date_col)$max()$alias("max")
  )
  # Add expressions for quantiles
  for (q in quantiles) {
    expr_list <- append(expr_list, pl$col(date_col)$quantile(q)$alias(paste0("p", sub("\\.", "", as.character(q)))))
  }
  # Apply the expressions and return a DataFrame with the results
  df$select(expr_list)
}

# Custom function for sampling with a seed
sample_n_with_seed <- function(data, size, seed) {
  set.seed(seed)
  dplyr::sample_n(data, size)
}

# Function to get the most frequent value 
most_frequent <- function(x) { 
  uniq_vals <- unique(x)
  freq_vals <- sapply(uniq_vals, function(val) sum(x == val))
  most_freq <- uniq_vals[which(freq_vals == max(freq_vals))]
  
  if (length(most_freq) == 1) {
    return(most_freq)
  } else {
    return(NA)
  }
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#CONFIG #######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

options(scipen=2) #display numbers rather scientific number

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

# Define the function first
#oins these values with semicolons and optionally truncates the result if it exceeds a specified width.
toString2 <- function(x, width = NULL, ...) {
    string <- paste(x, collapse = "; ")
    if (missing(width) || is.null(width) || width == 0) 
        return(string)
    if (width < 0) 
        stop("'width' must be positive")
    if (nchar(string, type = "w") > width) {
        width <- max(6, width)
        string <- paste0(substr(string, 1, width - 3), "...")
    }
    string
}
Error in contrib.url(repos, "source") : 
  trying to use CRAN without setting a mirror
* pak version:
- 0.8.0.1
* Version information:
- pak platform: x86_64-w64-mingw32 (current: x86_64-w64-mingw32, compatible)
- pak repository: - (local install?)
* Optional packages installed:
- pillar
* Library path:
- G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32
- C:/Program Files/R/R-4.4.1/library
* pak is installed at G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32/pak.
* Dependency versions:
- callr      3.7.6      
- cli        3.6.2      
- curl       5.2.1      
- desc       1.4.3      
- filelock   1.0.3      
- jsonlite   1.8.8      
- lpSolve    5.6.23.9000
- pkgbuild   1.4.4      
- pkgcache   2.2.2.9000 
- pkgdepends 0.7.2.9000 
- pkgsearch  3.1.3.9000 
- processx   3.8.4      
- ps         1.7.6      
- R6         2.5.1      
- zip        2.3.1      
* Dependencies can be loaded
> memory.size()
[1] Inf
> memory.size(TRUE)
[1] Inf
> memory.limit()
[1] Inf
[1] Inf


Note

To assess the main goals of the study, we first focused on distinguishing each user across the yearly datasets obtained from SENDA (1). Next, we separated each user’s treatments (2). Finally, we normalized, standardized, and cleaned each treatment (3). Although these stages may appear conceptually separate and sequential, they are interdependent (e.g., some variables needed to be standardized to identify duplicate entries).

Throughout this document, we use the terms “rows”, “cases”, “observations” or “treatment episodes” interchangeably to refer to entries in the dataset.

The previous document revealed overlapping cases and nearly identical records, as well as patients with unfinished treatments (i.e., missing discharge dates in the 2018-2019 databases) who might have received subsequent overlapping treatments.


pre-0. Missing discharge dates due to truncation in dataset retrieval

We observe that there are some cases with missing treatment discharge dates, as if they were still ongoing, because the responsible institution provided us with a database only up to the date of submission of the previous project (namely, November 2019). Therefore, the conclusion of those treatments (whether dropout, administrative discharge, or therapeutic discharge) cannot be determined. We also restricted these observations if they had a treatment completion status of “currently in”, as there were a few observations with days in admission that generated discharge dates previous to 2019 with a finished record.

Code
SISTRAT23_c1_2010_2022_df_prev1g|>
  filter(is.na(disch_date_num)) |> 
  mutate(disch_date_na= as.Date(adm_date_rec_num+ dias_en_tratamiento, origin = "1970-01-01")) |>
  select(TABLE_rec, rn, hash_key, dias_en_tratamiento, adm_age_rec, adm_date_rec, disch_date_na, id_centro, tr_compliance, plan_type, senda)|> 
  filter(disch_date_na<"2023-04-28" & grepl("currently",tr_compliance))|> 
  (\(df) { 
    cat(paste0("00. Missing discharge dates due to truncation, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("00. Missing discharge dates due to truncation, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    distinct(df, hash_key)|> pull(hash_key) ->> hash_truncated_treatments_due_to_retrieval_2019 
    df|> pull(rn) ->> rows_truncated_treatments_due_to_retrieval_2019 
    df
    })()|> 
  #View()
  filter(hash_key %in% (sample_n_with_seed(data.frame(hash_truncated_treatments_due_to_retrieval_2019),20, seed=2125)|> pull(1)))|>
  mutate(hash_key= as.numeric(factor(hash_key)))|>
  knitr::kable("markdown", caption= "Missing discharge dates due to administrative truncation (sample)")
00. Missing discharge dates due to truncation, cases: 876 
00. Missing discharge dates due to truncation, RUNs: 875 
Missing discharge dates due to administrative truncation (sample)
TABLE_rec rn hash_key dias_en_tratamiento adm_age_rec adm_date_rec disch_date_na id_centro tr_compliance plan_type senda
20191 165217 1 205 33.17 2019-04-22 2019-11-13 415 currently in pg-pab si
20191 166554 2 197 27.61 2019-04-30 2019-11-13 109 currently in pg-pai si
20151 73018 3 1734 38.89 2015-02-05 2019-11-05 139 currently in pg-pab si
20191 168790 4 114 45.00 2019-07-22 2019-11-13 497 currently in pg-pai si
20191 170585 5 104 51.37 2019-08-01 2019-11-13 667 currently in pg-pai si
20191 169192 6 141 29.24 2019-06-25 2019-11-13 489 currently in pg-pab si
20191 155751 7 723 53.68 2017-11-20 2019-11-13 628 currently in pg-pab si
20191 168082 8 162 43.62 2019-06-04 2019-11-13 432 currently in m-pr si
20191 155164 9 1336 35.78 2016-03-17 2019-11-13 146 currently in pg-pai si
20191 156096 10 627 31.05 2018-02-24 2019-11-13 625 currently in pg-pab si
2011 20203 11 2943 38.85 2011-10-15 2019-11-05 269 currently in pg-pr no
20161 88146 12 1610 26.49 2015-06-09 2019-11-05 556 currently in pg-pai si
20181 126990 13 979 20.24 2017-03-01 2019-11-05 307 currently in pg-pr si
20191 168526 14 127 18.37 2019-07-09 2019-11-13 614 currently in pg-pab si
20181 130398 15 762 26.70 2017-10-04 2019-11-05 239 currently in pg-pai si
2010 5236 16 3906 31.65 2009-02-24 2019-11-05 280 currently in pg-pab si
20161 88300 17 1572 16.25 2015-07-17 2019-11-05 291 currently in pg-pr no
20191 170097 18 91 49.82 2019-08-14 2019-11-13 612 currently in pg-pab si
20191 169808 19 99 58.67 2019-08-06 2019-11-13 489 currently in pg-pab si
20191 160795 20 330 18.01 2018-12-18 2019-11-13 138 currently in pg-pai si

However, it is important to note that if those treatments had continued, they would appear in the following year’s database. We successfully imported an updated 2019 database and attempted to standardize it according to the formatting applied before the initial step of the deduplication phase, as documented on June 2, 2025. For the remaining cases, and while we attempt to obtain additional complementary databases, a discharge date of December 31, 2019, was imputed.

After the imputation, we corrected cases by creating a join key (concat) by combining each patient’s hash_key with their admission date (adm_date_rec), then merging the main dataset (SISTRAT23_c1_2010_2022_df_prev1h) with the updated discharge information provided by SENDA professionals by that key (parsing the joined discharge date into a proper Date object). After selecting the relevant variables, we grouped the data by hash_key and, for any group with exactly one record whose original disch_date_rec0 equals December 31, 2019, assigns the actual parsed disch date to a new column (new_disch); all other rows receive NA. Finally, it ungroups and filters to keep only those rows where new_disch is not missing. This criteria responded to the request that was based in the HASH key only (not in the combination with admission date).

Code
X2019_2019dup_encrip <- readr::read_delim("G:/My Drive/Alvacast/SISTRAT 2023/data/20250508_original_data/2019_2019dup_encrip.csv", 
                                          delim = ";", escape_double = FALSE, trim_ws = TRUE, 
                                          locale = locale(decimal_mark = ",", grouping_mark = ".", tz = "America/Santiago", 
                                                          encoding = "latin1"),#encoding = "ISO-8859-1"),
                                          na = c("", "NA","null"),
                                          guess_max = min(1e5, Inf)) |> janitor::clean_names()|> 
  mutate(cod_indentificacion= tolower(codigo_identificaci_a_a3n))

Rows: 16383 Columns: 92 ── Column specification ──────────────────────────────────────────────────────── Delimiter: “;” chr (79): CodigoIdentificación, NombreCentro, TipoCentro, RegióndelCen… dbl (11): DiasenTratamiento, NMesesenTratamiento, DiasenSENDA, NMesesenSENDA… lgl (2): ParentescoconelJefedeHogar, DiagnósticoTrastornoPsiquiátri

ℹ Use spec() to retrieve the full column specification for this data. ℹ Specify the column types or set show_col_types = FALSE to quiet this message.

Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Format as most as possible the updated 2019 database in terms of dates")
df2019_mod_maureen <- 
X2019_2019dup_encrip|>
  filter(hashkey %in% hash_truncated_treatments_due_to_retrieval_2019
  )|> 
  #select(TABLE, hash_key, adm_date_rec, disch_date)
  select(hashkey, diasen_tratamiento, fecha_ingresoa_tratamiento, fecha_egresode_tratamiento, motivode_egreso) |> 
  mutate(discharge_date= stringr::str_replace_all(fecha_egresode_tratamiento,"/","-"))%>% 
  mutate(discharge_date= readr::parse_date(discharge_date, format="%d-%m-%Y")) |> 
  mutate(adm_date_orig= stringr::str_replace_all(fecha_ingresoa_tratamiento,"/","-"))%>% 
  mutate(adm_date_orig= readr::parse_date(adm_date_orig, format="%d-%m-%Y"))|> 
  mutate(discharge_date= as.character(discharge_date))|> 
  tidytable::mutate(discharge_date = tidytable::case_when(
    #rn== 1294
    hashkey=="c4795829b6ea9cfc50b988c85deb391fa041d99a0ebca6b68a1378f37e3eb420" & adm_date_orig=="2009-06-30" ~ "2019-12-30",
    #rn== 1934 
    hashkey=="23874d59570adaac6690c85481b869570c10c2f8931fc20636037cdff04af067" & adm_date_orig=="2008-07-02" ~ "2009-05-13",
    #rn== 1938 
    hashkey=="5a16413f76625a09585c89fd3ea4fb05d1ea5cbfbc18247a9fb6e7e21534562d" & adm_date_orig=="2008-07-23" ~ "2009-04-14",
    #rn== 2602 
    hashkey=="11b143acdce4bf1d3a72acd4a703ea8c38543fd02585b4f3b0433e227929ed3c" & adm_date_orig=="2008-03-04" ~ "2009-09-15",
    #rn== 2603 
    hashkey=="986ded00e6ca834805a169ed528655e22f819bf5104d1729b2e1453f20f38065" & adm_date_orig=="2008-12-05" ~ "2009-06-02",
    #rn== 2604 
    hashkey=="d402a1e13f25b2411ca346b0dc84b9fffa45887e628abf09262777b6deae85aa" & adm_date_orig=="2009-06-09" ~ "2009-06-09",
    #rn== 2896
    hashkey=="0d248b372c7224ae2cc1cabb750d6201150175b5d65ec0397ff2127d32b6b675" & adm_date_orig=="2009-02-05" ~ "2009-03-09",
    #rn== 3198
    hashkey== "6eb67e1ead556eb1dbd21951747440057a17a872b33b468a37c9bf781219cef8" & adm_date_orig=="2009-10-07" ~ "2010-04-10",
    #rn== 3260
    hashkey=="e0acff1477306ee93abfca7e251cc6d23db916b390a9fe506fbbefc371ce1d43" & adm_date_orig=="2009-12-07" ~ "2010-06-01",
    #rn== 5175
    hashkey=="eb13b44585501a35df9ce6d262ca6e69e4aa34063af219e19cc95e7609e38cdf" & adm_date_orig=="2010-04-26" ~ "2011-05-03",
    #rn== 5760
    hashkey=="058e8b2c02f98d488a78d78d80435e516c6628cd7edb87ecaf9f8c981d9614ba" & adm_date_orig=="2010-05-03" ~ "2010-10-04",      #rn== 6354
    hashkey=="4d42363412d6a435dd2762bbee7f9b4fe4117ff4c94d55e10472342156238ccb" & adm_date_orig=="2010-06-17" ~ "2010-07-01", 
    #rn== 5760
    hashkey=="058e8b2c02f98d488a78d78d80435e516c6628cd7edb87ecaf9f8c981d9614ba" & adm_date_orig=="2010-05-03" ~ "2010-10-04", 
    #rn== 8176
    hashkey=="228fc5b7b88c5f544f71f9ecfbad4d1750470b717f869a7aa9f01b0169a5d890" & adm_date_orig=="2010-07-01" ~ "2011-01-13", 
    #rn== 8756
    hashkey=="7ebe4155bb7741beef0f30ce47ecbc735bd1f7137d22e81ba21d5f12f8398fa2" & adm_date_orig=="2010-10-04" ~ "2011-01-31", 
    #rn== 5760
    hashkey=="058e8b2c02f98d488a78d78d80435e516c6628cd7edb87ecaf9f8c981d9614ba" & adm_date_orig=="2010-05-03" ~ "2010-10-04", 
    #rn== 9092
    hashkey=="93478aa27b121dbad91cb8e36ef60caa42fce6ca5b99478a77e9b8478df600f3" & adm_date_orig=="2010-11-23" ~ "2011-01-14", 
    #rn== 9171
    hashkey=="6500209f17b52ab4e00a140f7c8f0a10d9b073f81ac9443203f0a1b84c4dc1e8" & adm_date_orig=="2010-11-25" ~ "2011-06-10", 
    #rn== 9177
    hashkey=="4d6e97bfc2aeb15a8c6457ad1c84335de48b5456177b9749159ec2974537634f" & adm_date_orig=="2010-11-25" ~ "2011-06-20", 
    #rn== 9444
    hashkey=="1d5a63a966cea8241228f0057a38ef4e63e0fb353dda174dc95d4393e4cdcefa" & adm_date_orig=="2010-12-02" ~ "2011-06-10", 
    #rn== 10424
    hashkey=="eb13b44585501a35df9ce6d262ca6e69e4aa34063af219e19cc95e7609e38cdf" & adm_date_orig=="2010-04-26" ~ "2011-05-03", 
    #rn== 11482
    hashkey=="228fc5b7b88c5f544f71f9ecfbad4d1750470b717f869a7aa9f01b0169a5d890" & adm_date_orig=="2010-07-01" ~ "2011-01-13",  
    #rn== 12097
    hashkey=="6500209f17b52ab4e00a140f7c8f0a10d9b073f81ac9443203f0a1b84c4dc1e8" & adm_date_orig=="2010-11-25" ~ "2011-06-10",      #rn== 12102
    hashkey=="4d6e97bfc2aeb15a8c6457ad1c84335de48b5456177b9749159ec2974537634f" & adm_date_orig=="2010-11-25" ~ "2011-06-20",      #rn== 12301
    hashkey=="1d5a63a966cea8241228f0057a38ef4e63e0fb353dda174dc95d4393e4cdcefa" & adm_date_orig=="2010-12-02" ~ "2011-06-10",      #rn== 13086
    hashkey=="c75bb8c43963dbad7a1b311497073a58b0e97bb82c5c63a4bc7ae4d1c9014592" & adm_date_orig=="2011-01-13" ~ "2011-07-10",  
    #rn== 13644
    hashkey=="f40999d751e9eb84f5ed6d832d96a1de872599c181e28dd420507c58d7464ccf" & adm_date_orig=="2011-02-08" ~ "2011-08-04", 
    #rn== 14099
    hashkey=="dbe7ddec7591332da15c3c4a1d4a2a1559d455a67b6c31a390ea546ea259c045" & adm_date_orig=="2011-02-10" ~ "2011-05-03", 
    #rn== 14339
    hashkey=="05ff2bf96ef3a294c09b39cf91c19f7a74b080487f13f62c449812f14cefff37" & adm_date_orig=="2011-03-22" ~ "2011-07-31", 
    #rn== 15403
    hashkey=="bdf81829448433489a21d8ac17de96f3765707798d8e2beb7653414f43f272aa" & adm_date_orig=="2011-04-15" ~ "2011-06-12", 
    #rn== 16016
    hashkey=="0bd45263c5217ae4324c23ca4bfec945d4100276fcac4e3e66ad5b6f5341d3fd" & adm_date_orig=="2011-05-20" ~ "2011-06-01", 
    #rn== 16150
    hashkey=="d6d0aaa21c50981871615a6b8886d1f69a3d0f125165f63f6a1c54729be5eea2" & adm_date_orig=="2011-05-23" ~ "2011-06-05",  
    #rn== 16413
    hashkey=="4728851a593a1490d73682e45945fe0f253d0f18dfc12aa1d2d21deef206c39c" & adm_date_orig=="2011-04-18" ~ "2011-08-30", 
    #rn== 16742
    hashkey=="caafb47faaab3c9637821a50ce4dcef33b8e3a9fc275f0ef76f0c93681eb15ba" & adm_date_orig=="2011-06-06" ~ "2011-07-04", 
    #rn== 16745
    hashkey=="18096679bef8db59dbd0ca3be91fa36d7d9dcbbf06b85be2662f410d0146d1a2" & adm_date_orig=="2011-06-17" ~ "2011-07-31", 
    #rn== 16755
    hashkey=="40d3ff594c6c3ddd96e37e5e53fbd22030916a99a4f04cf6283ad188058f2a5b" & adm_date_orig=="2011-06-23" ~ "2011-07-07", 
    #rn== 17500
    hashkey=="667766680894eb203756044682c8445365bb0a831012ec49341b080390133d5d" & adm_date_orig=="2011-06-20" ~ "2011-08-02", 
    #rn== 30449
    hashkey=="60e3066c438a10246353d3a3bce07a58fbfda39465aa84debd48cede21319a94" & adm_date_orig=="2012-10-16" ~ "2013-08-13", 
    #rn== 34193
    hashkey=="60e3066c438a10246353d3a3bce07a58fbfda39465aa84debd48cede21319a94" & adm_date_orig=="2012-10-16" ~ "2013-08-13", 
    #rn== 35638
    hashkey=="08a5dc9a016c0525d7ceea954a8078391701ea9743b71bc2a012f0949952029f" & adm_date_orig=="2013-01-07" ~ "2013-07-17", 
    #rn== 36161
    hashkey=="71049ebb5d958e0647c01c4398c91ff3e02275f7dc5e2fefee5bc263a7653c96" & adm_date_orig=="2013-01-28" ~ "2013-08-12", 
    #rn== 36415
    hashkey=="52e218f6406835e8624ffe71595152560ec44a02a7580d673019eefa88df7a61" & adm_date_orig=="2013-01-29" ~ "2013-04-02", 
    #rn== 37116
    hashkey=="22c282462adfb8e48b3a6b697d533244c9c656a6b31ff87d0180679d9f5ce98d" & adm_date_orig=="2013-02-08" ~ "2013-08-02", 
    #rn== 37958
    hashkey=="221d71ae6c4dba4aee931b3ee518d47fd3972fed3fbf7f4d44c676bedca786c4" & adm_date_orig=="2013-03-18" ~ "2013-07-10", 
    #rn== 38907
    hashkey=="877ea9b68dde038d9f63d04d4e65d1eb27ac3f46af22e310c7c2114feb7f871b" & adm_date_orig=="2013-04-18" ~ "2013-07-31", 
    #rn== 38908
    hashkey=="14af0ddf318fb49877b16491b0fb7df491d98bd32dd854bdbec526f898dd9946" & adm_date_orig=="2013-04-18" ~ "2013-06-17", 
    #rn== 38909
    hashkey=="243a1044f746ae87432532552b4b93b6978fb3b18fa3a4305a11b2af698eb013" & adm_date_orig=="2013-04-16" ~ "2013-07-27", 
    #rn== 39617
    hashkey=="0e729e637c95d5d4486a7f822d14f0f1925ac358fff61d9bba9d7407b8e9abe7" & adm_date_orig=="2013-04-29" ~ "2013-07-25", 
    #rn== 39618
    hashkey=="289a7b6c884980dc60c9171bb05939bacf18a62551ebda723af75cbfc8308db9" & adm_date_orig=="2013-05-08" ~ "2013-07-14", 
    #rn== 39620
    hashkey=="cde086d548022a94e623bfc3d6b34202b28141ed2134ba35425ce4807e75f2fb" & adm_date_orig=="2013-04-29" ~ "2013-07-02", 
    #rn== 40045
    hashkey=="10fc40384411161967b222bf530a0378e0ae585bd69370d57d9c4fb49a1a34c3" & adm_date_orig=="2013-05-22" ~ "2013-08-02",  
    #rn== 40293
    hashkey=="67353760ae53ad8963176af0ec6cab9c4bdad13b9e53058e68e53f80b409b224" & adm_date_orig=="2013-05-29" ~ "2013-08-07",
    #rn== 40599
    hashkey=="3ce639d4d0330242d1f7c1e6496e834ad3fa2b41bef89b09bc373e9dede8c981" & adm_date_orig=="2013-05-02" ~ "2013-07-03",
    #rn== 41114
    hashkey=="5e6d9dcec9e717d4536f7cfa5cc0f713e7c2c7933058aeb9a37fec0a24da5151" & adm_date_orig=="2013-06-06" ~ "2013-07-31",
    #rn== 41117
    hashkey=="e01e3218ba73e9d26178e7a6aceb86357695bc88117f1d7b89c8adbf55210528" & adm_date_orig=="2013-06-05" ~ "2013-06-27",
    #rn== 42456
    hashkey=="421abbc2c85687aa87adec1c3146debf5ddea3ea71f65d708c2cf4d4dde86e38" & adm_date_orig=="2013-07-02" ~ "2013-07-08",
    #rn== 42633
    hashkey=="567f1fd735550a9bc1a2ea8a838d87b69369caa106c2d0cd0a1b38581d09919f" & adm_date_orig=="2013-07-09" ~ "2013-08-16",
    #rn== 42634
    hashkey=="7f259b5289b209cc669db813abfcd14519a21c4f69aaeb0190f094c61a52afad" & adm_date_orig=="2013-06-28" ~ "2013-07-09", 
    #rn== 42854
    hashkey=="49cca05a51baac5c836a053eac96674c775e2d7164209a04f09f8da34952b789" & adm_date_orig=="2013-07-02" ~ "2013-08-02",
    #rn== 43076
    hashkey=="6adbbaff91e32138777abcf66a161d953722255c88368f9a5877d1ddfa48decd" & adm_date_orig=="2013-08-06" ~ "2013-08-20",
    #rn== 43181
    hashkey=="02c866ee44e5a3a310cf18728753e3a4c3751d4ea4d61edc22d78606cde0fcc8" & adm_date_orig=="2013-08-01" ~ "2013-08-16",
    #rn== 43182
    hashkey=="506be60207917af56fa39175f11ee5b3b874c0883245e37d0b2a79e0b24f08ad" & adm_date_orig=="2013-08-01" ~ "2013-08-22",
    TRUE ~ as.character(discharge_date)
  ))|> 
  tidytable::mutate(discharge_date= readr::parse_date(discharge_date, format="%Y-%m-%d"), motivode_egreso= tolower(motivode_egreso))|>
    #Early vs. late dropout
  tidytable::mutate(dit_earl_drop= ifelse(diasen_tratamiento>=90 & !is.na(diasen_tratamiento),0,1))|>
  #changed the order of the labels
  tidytable::mutate(dit_earl_drop= factor(dit_earl_drop, labels=c(">= 90 days","<90 days")))|> #t.test(dit_rec~ dit_earl_drop, data= df)
  tidytable::mutate(
    tr_compliance = case_when(
      grepl("<", dit_earl_drop) & grepl("abando",motivode_egreso)         ~ "early dropout",
      grepl(">", dit_earl_drop) & grepl("abando",motivode_egreso)         ~ "late dropout",
      grepl("<", dit_earl_drop) & grepl("adm", motivode_egreso)          ~ "early adm discharge",
      grepl(">", dit_earl_drop) & grepl("adm", motivode_egreso)          ~ "late adm discharge",
      grepl("alta ter", motivode_egreso)                              ~ "completion",
      motivode_egreso == "muerte"                                         ~ "death",
      grepl("derivac", motivode_egreso)                                     ~ "referral",
      is.na(motivode_egreso)                                              ~ "currently in",
      TRUE                                                                ~ "other"
    )
  ) #|>   janitor::tabyl(tr_compliance)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:


SISTRAT23_c1_2010_2022_df_prev1g|>
  filter(is.na(disch_date_num)) |> 
  mutate(disch_date_na= as.Date(adm_date_rec_num+ dias_en_tratamiento, origin = "1970-01-01")) |>
  (\(df){
    cat("Table of dates of discharge with days in treatment\n")
    print(table(df$disch_date_na))
    cat("We should discard dates previous to 2023-04-28 because they were part of the actuala dministrative truncation process\n\n")
    df
  })() |> 
  select(TABLE_rec, rn, hash_key, dias_en_tratamiento, adm_age_rec, adm_date_rec, disch_date, disch_date_na, id_centro, tr_compliance, plan_type, senda) |> 
  #filter(disch_date_na<"2023-04-28" & grepl("currently",tr_compliance)) |> 
  filter(disch_date_na<"2023-04-28")|> 
  #Take only 
  inner_join(df2019_mod_maureen, by= c("hash_key"="hashkey", "adm_date_rec"="adm_date_orig")) |> 
  select(-fecha_ingresoa_tratamiento, -fecha_egresode_tratamiento)|> 
  (\(df) { 
    cat(paste0("New discharge date from updated C1 2019 database (discarding discharges in 2024-04-28; COINCIDENCE by HASH & admission date), cases: ", formatC(nrow(filter(df, !is.na(discharge_date))), big.mark=",")),"\n")
    cat(paste0("New discharge date from updated C1 2019 database (discarding discharges in 2024-04-28; COINCIDENCE by HASH & admission date), RUNs: ", formatC(nrow(distinct(filter(df, !is.na(discharge_date)), hash_key)), big.mark=","),"\n\n"))
    #export the records with discharge dates that didnt have earlier
    base::subset(df, 
    subset = !is.na(discharge_date), 
    select = c("rn","hash_key", "adm_date_rec", "discharge_date", "tr_compliance.y")) ->> hashs_dates_updated_disch_date
    cat("Lets check yearly database origin. Where do they come from?...\n")
    print(janitor::tabyl(df, TABLE_rec))
  })()  
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

cat("Corrected the database with updated discharge dates")

colnames(hashs_dates_updated_disch_date) <- c("rny", "hash_key", "date_adm", "date_disch", "tr_comp")

SISTRAT23_c1_2010_2022_df_prev1h<-
SISTRAT23_c1_2010_2022_df_prev1g|>
      (\(df) {
    cat(paste0("4.pre. Database before correcting discharge dates, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4.pre. Database before discarding discharge dates, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
      })()|> 
  #left_join(hashs_dates_updated_disch_date, by=c("hash_key"="hash_key", "adm_date_rec"="date_adm"))|> 
  left_join(hashs_dates_updated_disch_date[,c("rny", "date_disch", "tr_comp")], by=c("rn"="rny"))|> 
        (\(df) {
      if (nrow(df)> nrow(SISTRAT23_c1_2010_2022_df_prev1g))stop("Error: Added treatment episodes in the process")
          df
     })()|> 
  mutate(date_disch_num= unclass(date_disch))|> 
  #If it has an updated discharge date, replace with this; if not and applies, add Dec 31th, 2019. If not, preserve date.
  mutate(disch_date_rec0_num= case_when(rn %in% hashs_dates_updated_disch_date$rny~ date_disch_num, rn %in% setdiff(rows_truncated_treatments_due_to_retrieval_2019, hashs_dates_updated_disch_date$rny)~ 18261, T~ disch_date_num))|>
  mutate(disch_date_rec0= case_when(rn %in% hashs_dates_updated_disch_date$rny~ date_disch, rn %in% setdiff(rows_truncated_treatments_due_to_retrieval_2019, hashs_dates_updated_disch_date$rny)~  as.Date("2019-12-31"), T~disch_date))|> 
  mutate(dit_rec1= case_when(rn %in% hashs_dates_updated_disch_date$rny~  (date_disch_num- adm_date_rec_num), rn %in% setdiff(rows_truncated_treatments_due_to_retrieval_2019, hashs_dates_updated_disch_date$rny)~ (disch_date_rec0_num- adm_date_rec_num), T~ dit_rec))|> 
  #added the updated status; if not available, we imputed the "adm truncated"
  #the rest keeps their tr. compliance status
  mutate(tr_compliance_rec= case_when(rn %in% hashs_dates_updated_disch_date$rny~ tr_comp, rn %in% setdiff(rows_truncated_treatments_due_to_retrieval_2019, hashs_dates_updated_disch_date$rny)~ "adm truncated", T~tr_compliance))|>
  mutate(OBS= case_when(rn %in% hashs_dates_updated_disch_date$rny~ paste0(OBS, "; 4.pre. Missing discharge dates due administrative truncation in 2019, updated"), T~OBS))|> 
  mutate(OBS= case_when(rn %in% setdiff(rows_truncated_treatments_due_to_retrieval_2019, hashs_dates_updated_disch_date$rny)~ paste0(OBS, "; 4.pre. Missing discharge dates due administrative truncation in 2019, imputed"), T~OBS))|> 
          (\(df) {
    cat(paste0("4.pre. Database after correcting discharge dates, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4.pre. Database after discarding discharge dates, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    if (nrow(df)> nrow(SISTRAT23_c1_2010_2022_df_prev1g))stop("Error: Added treatment episodes in the process")
    df
      })() |> 
  select(-date_disch)#, -tr_comp)


table(SISTRAT23_c1_2010_2022_df_prev1h$tr_comp)|> 
  data.frame()|> 
  left_join(data.frame(prop.table(table(SISTRAT23_c1_2010_2022_df_prev1h$tr_comp))), by="Var1")|> 
  rename("n"="Freq.x", "%"="Freq.y")|> 
  mutate(`%`=scales::percent(`%`))|> 
  left_join(psych::describeBy(SISTRAT23_c1_2010_2022_df_prev1h$dit_rec1, SISTRAT23_c1_2010_2022_df_prev1h$tr_comp, mat=T, quant = c(0.25, 0.75), digits=2)[,c("group1", "mean", "median", "Q0.25", "Q0.75")], by= c("Var1"="group1"))|> 
  knitr::kable("markdown", caption= "Tr compliance status of the updated records")

#remove original variable. Now we have the merged (tr_compliance_rec)
SISTRAT23_c1_2010_2022_df_prev1h$tr_comp <- NULL

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("2025-06-05: Maureen saved some cases\n")

maureen_casos_disch_miss <- readr::read_tsv("https://docs.google.com/spreadsheets/u/2/d/1X0jBuHooVx5RnV9p3fu0tqOlb9JVSyT0ZRSrhxM4kEI/export?format=tsv&id=1X0jBuHooVx5RnV9p3fu0tqOlb9JVSyT0ZRSrhxM4kEI&gid=0")

Rows: 381 Columns: 3 ── Column specification ──────────────────────────────────────────────────────── Delimiter: “ chr (2): concat, rescate FE dbl (1): posicion_en_bd

ℹ Use spec() to retrieve the full column specification for this data. ℹ Specify the column types or set show_col_types = FALSE to quiet this message.

Code
obtained_from_senda_professional_jun_2025<- 
SISTRAT23_c1_2010_2022_df_prev1h |> 
    mutate(concat= paste0(hash_key,"_",adm_date_rec)) |> 
    inner_join(mutate(maureen_casos_disch_miss, disch= readr::parse_date(`rescate FE`, format="%m/%d/%Y")), by="concat") |> tidytable::select(rn,hash_key, tr_compliance_rec, disch_date_rec0, disch_date_rec0_num, dit_rec1, concat, disch) |> 
    group_by(hash_key)|>
    mutate(
        corr_disch = if_else(
            n() == 1 & disch_date_rec0 == as.Date("2019-12-31"),
            disch,
            as.Date(NA)
        )
    )|> 
    ungroup() |> filter(!is.na(corr_disch))



#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("Inclusion of information provided by SENDA professionals, june 2025")

SISTRAT23_c1_2010_2022_df_prev1h <-
  SISTRAT23_c1_2010_2022_df_prev1h |>
  (\(df) {
    cat(paste0("4.*. Database before correcting discharge dates, provided June 25, cases: ", formatC(nrow(df), big.mark=",")), "\n")
    cat(paste0("4.*. Database before correcting discharge dates, provided June 25, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")), "\n")
    df
  })() |>
  left_join(obtained_from_senda_professional_jun_2025 |> select(rn, corr_disch), by = "rn") |>
  mutate(
    disch_date_rec0 = if_else(!is.na(corr_disch), corr_disch, disch_date_rec0),
    disch_date_rec0_num = unclass(disch_date_rec0),
    dit_rec1 = if_else(!is.na(corr_disch), disch_date_rec0_num - adm_date_rec_num, dit_rec1),
    #tr_compliance_rec = if_else(!is.na(corr_disch), NA_character_, tr_compliance_rec),
    OBS = if_else(!is.na(corr_disch), paste0(OBS, "; 4.*.replaced missing disch dates"), OBS)
  ) |>
  select(-corr_disch) |>
  (\(df) {
    cat(paste0("4.*. Database after correcting discharge dates, provided June 25, cases: ", formatC(nrow(df), big.mark=",")), "\n")
    cat(paste0("4.*. Database after correcting discharge dates, provided June 25, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")), "\n")
    if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1h)) stop("Error: Added treatment episodes in the process")
    df
  })()
Format as most as possible the updated 2019 database in terms of datesTable of dates of discharge with days in treatment

2009-03-23 2009-10-01 2010-01-20 2010-04-03 2010-10-01 2019-11-05 2019-11-13 
         1          1          1          1          1        382        506 
2023-04-28 2023-05-03 
      3995          1 
We should discard dates previous to 2023-04-28 because they were part of the actuala dministrative truncation process

New discharge date from updated C1 2019 database (discarding discharges in 2024-04-28; COINCIDENCE by HASH & admission date), cases: 330 
New discharge date from updated C1 2019 database (discarding discharges in 2024-04-28; COINCIDENCE by HASH & admission date), RUNs: 330

Lets check yearly database origin. Where do they come from?...
 TABLE_rec   n percent
     20191 497       1
Corrected the database with updated discharge dates4.pre. Database before correcting discharge dates, cases: 150,187 
4.pre. Database before discarding discharge dates, RUNs: 106,283 
4.pre. Database after correcting discharge dates, cases: 150,187 
4.pre. Database after discarding discharge dates, RUNs: 106,283 
Tr compliance status of the updated records
Var1 n % mean median Q0.25 Q0.75
completion 59 17.9% 372.66 332.0 223.50 447.50
early adm discharge 2 0.6% 42.50 42.5 21.75 63.25
early dropout 63 19.1% 64.46 67.0 52.50 82.00
late adm discharge 12 3.6% 402.75 291.5 148.75 513.50
late dropout 162 49.1% 197.35 161.5 121.50 244.75
referral 32 9.7% 197.41 148.0 90.75 244.00
2025-06-05: Maureen saved some cases
Inclusion of information provided by SENDA professionals, june 20254.*. Database before correcting discharge dates, provided June 25, cases: 150,187 
4.*. Database before correcting discharge dates, provided June 25, RUNs: 106,283 
4.*. Database after correcting discharge dates, provided June 25, cases: 150,187 
4.*. Database after correcting discharge dates, provided June 25, RUNs: 106,283 

We corrected dates of discharge in numeric (disch_date_rec0_num) and date (disch_date_rec0) formats with “2019-12-31” and 18261. Also we corrected the days in treatment into dit_rec1. Finally, we recoded the tr_compliance variable to indicate that the treatment was truncated due to administrative reasons (tr_compliance_rec).

0. Rule-based deduplication

In order to find and delete duplicated data that does not add information relevant for the purposes of the study, we now may use these standardized variables as a criteria to achieve the goal of having a unique event per HASH, by reducing its complexity based on irrelevant differences.

0.a. Deduplication based on standardized columns of interest for the study

An analysis based on the following criteria, ended with an index of how many differences are within cases with the same HASH and date of admission, and in determining which variables can be tolerable to have differences. For example, if two or more cases share the same date of admission and hash, but most of the variables are different, it is possible to think that information may be lost if one of them is deleted. In another example, if two or more cases share the same date of admission and hash, but the only differences are observed in the days of treatment, one may think that only the case with more treatment days must be preserved.

  • hash_key= Masked Identifier (RUN)
  • region_del_centro= Chilean Region of the Center
  • dit_rec= Days of Treatment
  • adm_date_rec_num= Date of Admission to Treatment
  • disch_date= Date of Discharge from Treatment
  • id_centro= Treatment Center ID
  • codigo_identificacion = SENDA ID
  • adm_age_rec= Age at Admission to Treatment
  • age_subs_onset= Age of Onset of Drug Use
  • age_prim_subs_onset= Age of Onset of Drug Use Primary Substance
  • type_center= Type of Center
  • nacionalidad= Nationality
  • etnia= Ethnicity
  • diagnostico_trs_psiquiatrico_dsm_iv= Diagnosis of Psychiatric Disorders, DSM-IV criteria
  • diagnostico_trs_psiquiatrico_sub_dsm_iv= Diagnosis of Psychiatric Disorders, DSM-IV criteria (sub-classification)
  • x2_diagnostico_trs_psiquiatrico_dsm_iv= Diagnosis of Psychiatric Disorders, DSM-IV criteria (2)
  • x2_diagnostico_trs_psiquiatrico_sub_dsm_iv= Diagnosis of Psychiatric Disorders, DSM-IV criteria (sub-classification) (2)
  • x3_diagnostico_trs_psiquiatrico_dsm_iv= Diagnosis of Psychiatric Disorders, DSM-IV criteria (3)
  • x3_diagnostico_trs_psiquiatrico_sub_dsm_iv= Diagnosis of Psychiatric Disorders, DSM-IV criteria (sub-classification) (3)
  • diagnostico_trs_psiquiatrico_cie_10= Diagnosis of Psychiatric Disorders, CIE-10 criteria
  • diagnostico_trs_psiquiatrico_sub_cie_10= Diagnosis of Psychiatric Disorders, CIE-10 criteria (subclassification)
  • x2_diagnostico_trs_psiquiatrico_cie_10= Diagnosis of Psychiatric Disorders, CIE-10 criteria (2)
  • x2_diagnostico_trs_psiquiatrico_sub_cie_10= Diagnosis of Psychiatric Disorders, CIE-10 criteria (subclassification) (2)
  • x3_diagnostico_trs_psiquiatrico_cie_10= Diagnosis of Psychiatric Disorders, CIE-10 criteria (3)
  • x3_diagnostico_trs_psiquiatrico_sub_cie_10= Diagnosis of Psychiatric Disorders, CIE-10 criteria (subclassification) (3)
  • sub_dep_icd10_status= Drug dependence diagnosis
  • biopsych_comp= Biopsychosocial compromise
  • sexo= Sex of User
  • plan_type= Type of Plan
  • tipo_de_programa_2= Type of Program
  • tr_compliance= Cause of Discharge (with late and early withdrawal)
  • primary_sub= Primary or Main Substance of Consumption
  • second_sub1= Other Substances (1)
  • second_sub2= Other Substances (2)
  • second_sub3= Other Substances (3)
  • first_sub_used= Starting Substance
  • marital_status= Marital Status
  • occupation_condition= Occupational Status
  • occupation_status= Occupational Category
  • adm_motive= Motive of Admission to Treatment
  • ed_attainment= Educational Attainment
  • prim_sub_route= Route of Administration of the Primary or Main Substance
  • prim_sub_freq= Frequency of Consumption of the Primary or Main Substance
  • municipallity_res_cutpre18= Commune/municipallity of residence
Code
criterios_show<- c('hash_key', 'region_del_centro', 'dit_rec', 'adm_date_rec_num', 'disch_date', 'id_centro', 'codigo_identificacion ', 'adm_age_rec', 'age_subs_onset', 'age_prim_subs_onset', 'type_center', 'nacionalidad', 'etnia', 'diagnostico_trs_psiquiatrico_dsm_iv', 'diagnostico_trs_psiquiatrico_sub_dsm_iv', 'x2_diagnostico_trs_psiquiatrico_dsm_iv', 'x2_diagnostico_trs_psiquiatrico_sub_dsm_iv', 'x3_diagnostico_trs_psiquiatrico_dsm_iv', 'x3_diagnostico_trs_psiquiatrico_sub_dsm_iv', 'diagnostico_trs_psiquiatrico_cie_10', 'diagnostico_trs_psiquiatrico_sub_cie_10', 'x2_diagnostico_trs_psiquiatrico_cie_10', 'x2_diagnostico_trs_psiquiatrico_sub_cie_10', 'x3_diagnostico_trs_psiquiatrico_cie_10', 'x3_diagnostico_trs_psiquiatrico_sub_cie_10', 'sub_dep_icd10_status', 'biopsych_comp', 'sexo', 'plan_type', 'tipo_de_programa_2', 'tr_compliance', 'primary_sub', 'second_sub1', 'second_sub2', 'second_sub3', 'first_sub_used', 'marital_status', 'occupation_condition', 'occupation_status', 'adm_motive', 'ed_attainment', 'prim_sub_route', 'prim_sub_freq', 'municipallity_res_cutpre18')


#Duplicated entries
## according to DVG of 2020
### SENDA yes vs. no
### Earlier database
### More treatment days
### No discharge dates missing
### cases in study?. out
### cases with greater frequency of substance use

This section is no longer pertinent, as we have excluded patients with records that share identical admission dates.


0.b. Deduplication from the Overlap Between Dates of Admission & Discharge

Once the duplicated cases were discarded, we searched for cases in which dates ranges were overlapped with other treatments for the same user (HASH). To search different overlappings, we had to temporarily replace those cases that did not have a date of discharge, with the date of retrieval of the datasets that was “2024-04-28” [disch_date_num_miss] (dates are in the format “years-month-day” in this document).

Code
# dias_en_tratamiento
CONS_C1_df_dup_intervals<- 
  SISTRAT23_c1_2010_2022_df_prev1h|>
    mutate(disch_date_num_miss= ifelse(is.na(disch_date_rec0_num), 19475, disch_date_rec0_num))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
    rename("hash_key_2"="hash_key", "rn2"="rn")|>
    select(rn2, hash_key_2, TABLE, adm_age_rec, adm_date_rec, adm_date_rec_num , disch_date_rec0, disch_date_num_miss, dit_rec1, id_centro, tr_compliance_rec, plan_type, senda)|> 
    #dplyr::filter(motivodeegreso!="Derivación")|>
    data.table::as.data.table()
  
overlap_dates_C1 <- janitor::clean_names(
    sqldf::sqldf(
      "
      SELECT *
      FROM CONS_C1_df_dup_intervals AS x
      INNER JOIN CONS_C1_df_dup_intervals AS y 
      ON x.hash_key_2 = y.hash_key_2 
         AND x.rn2 < y.rn2  -- Avoids duplicates (eg.: x vs y and then y vs x)
         AND x.adm_date_rec_num < y.disch_date_num_miss  -- x Admitted before being admitted into another treatment
         AND x.disch_date_num_miss > y.adm_date_rec_num  -- x Discharged after being admitted in other
         "
    ))  |>
    `colnames<-`(c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1",  "rn_2", "hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2")) 
  
    cat(paste0("Number of overlapped dates, observations: ", nrow(overlap_dates_C1)),"\n")
    cat(paste0("Number of overlapped dates, RUNs: ", nrow(distinct(overlap_dates_C1, hash_key_1))))
    #Number of overlapped dates, observations: 1554 june 2025;  1562; march 2025 1659 ; in 2020, 1,448
    #Number of overlapped dates, RUNs: 1413 june 2025; 1420; march 2025 1491; in 2020, 173

#The rows on the left originate from older databases.
CONS_C1_df_dup_overlaps_COMP <- 
as_tidytable(overlap_dates_C1)|>
  mutate(pair_id= paste0(rn_1,"_",rn_2))|> 
  mutate(same_id=ifelse(id_centro_1==id_centro_2,1,0))|>
  mutate(bd_2_earlier=ifelse(ano_bd_2>ano_bd_1,1,0))|> #es el dato de la derecha de una base de datos mas reciente.
  mutate(senda_status= case_when(senda_1=="si" & senda_2=="si"~ "both yes", senda_1=="no" & senda_2=="no"~ "both no", senda_1=="no" & senda_2=="si"~ "second yes", senda_1=="no" & senda_2=="no"~ "second no",  T~NA_character_))|>
  mutate(referral= ifelse(tr_compliance_1=="referral",1,0))|>
  mutate(days_overlapped=disch_date_num_1-adm_date_rec_num_2)|> # para que hayan dias positivos. Se supone que la fecha de egreso es más reciente que la fecha de ingreso del evento que superpone.
  mutate(more_dit=ifelse(dit_2>dit_1,1,0))|> #más días tratado en 2
  mutate(trat_1_within_2=ifelse(disch_date_num_1<disch_date_num_2 & adm_date_rec_num_1>adm_date_rec_num_2,1,0))|>
  select(-hash_key_2) |> 
  rename("hash_key"="hash_key_1")

####
CONS_C1_df_dup_overlaps_COMP|>
      (\(df) {
        mutate(df, hash_key= as.numeric(factor(hash_key)))|>  rio::export("_out/_overlaps_dup_step_2.xlsx") #for visual comparison in excel
        knitr::kable(filter(df, hash_key %in% pull(sample_n_with_seed(CONS_C1_df_dup_overlaps_COMP,20, seed=2125),"hash_key"))|> mutate(hash_key= as.numeric(factor(hash_key))), format = "html", format.args = list(decimal.mark = ".", big.mark = ","),
               caption="Cases with overlapped treatment ranges", align = rep('c', 32),  
               #col.names = c("Row No.(1)", "HASH", "Year of\nDataset(1)", "Admission age(1)", "Admission\ndate(1)(a)", "Admission\ndate(1)(b)", "Discharge\ndate(1)(a)", "Discharge\ndate(1)(b)", "Treatment Days(1)", "Center ID(1)", "Cause of\nDischarge(1)", "Plan Type(1)", "SENDA(1)", "Row No.(2)", "Year of\nDataset(2)", "Admission age(2)", "Admission\ndate(2)(a)", "Admission\ndate(2)(b)", "Discharge\ndate(2)(a)", "Discharge\ndate(2)(b)", "Treatment Days(2)", "Center ID(2)", "Cause of\nDischarge(2)", "Plan Type(2)", "SENDA(2)", "Same Center ID", "Earlier Dataset \nof 2nd Treatment", "Financed \nBy SENDA", "Referral", "Days Overlapped", "2nd treatment has more treatment days", "1st treatment\nabsorbs 2nd")
  ) |>
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"),font_size = 8)|>
  kableExtra::add_footnote( c("Note.Each row represents an overlap. Variables ending with '_1' are the first case, and variables ending with '_2' correspond to the second case;", "a= date; b= numeric", "Same Center ID= If both cases share the same Center ID",  "Financed By SENDA= If both cases are financed by SENDA;", "Referral= If the cause of discharge is the referral from another center (1= Referral);","Days Overlapped= Difference between the date of admission of the earlier treatment, and the date of discharge of the latter treatment","2nd treatment has more treatment days= Earlier treatment has more days of treatment"), notation = "none")|>
  kableExtra::scroll_box(width = "100%", height = "375px")
      })()
Number of overlapped dates, observations: 1554 
Number of overlapped dates, RUNs: 1413
Cases with overlapped treatment ranges
rn_1 hash_key ano_bd_1 adm_age_1 adm_date_1 adm_date_rec_num_1 disch_date_1 disch_date_num_1 dit_1 id_centro_1 tr_compliance_1 plan_type_1 senda_1 rn_2 ano_bd_2 adm_age_2 adm_date_2 adm_date_rec_num_2 disch_date_2 disch_date_num_2 dit_2 id_centro_2 tr_compliance_2 plan_type_2 senda_2 pair_id same_id bd_2_earlier senda_status referral days_overlapped more_dit trat_1_within_2
49,390 1 2014 35.81 2013-06-17 15,873 2014-01-27 16,097 224 166 late dropout pg-pab si 53,962 2014 36.35 2013-12-30 16,069 2014-01-03 16,073 4 163 early dropout m-pr si 49390_53962 0 0 both yes 0 28 0 0
64,649 2 2014 37.92 2014-11-11 16,385 2014-12-01 16,405 20 212 referral pg-pai si 70,654 2015 37.97 2014-11-28 16,402 2014-12-31 16,435 33 216 referral pg-pr si 64649_70654 0 1 both yes 1 3 1 0
62,072 3 2014 65.68 2014-09-25 16,338 2014-12-02 16,406 68 262 referral m-pai si 70,640 2015 65.87 2014-12-01 16,405 2015-10-13 16,721 316 258 late adm discharge m-pr si 62072_70640 0 1 both yes 1 1 1 0
18,306 4 2011 23.71 2011-08-10 15,196 2011-09-30 15,247 51 259 completion pg-pai si 22,738 2012 23.84 2011-09-28 15,245 2012-04-25 15,455 210 257 referral pg-pr si 18306_22738 0 1 both yes 0 2 1 0
21,516 5 2012 28.45 2011-06-06 15,131 2012-04-27 15,457 326 262 referral m-pai si 26,680 2012 29.33 2012-04-25 15,455 2012-06-12 15,503 48 275 referral m-pr si 21516_26680 0 0 both yes 1 2 0 0
28,059 5 2012 29.49 2012-06-21 15,512 2012-10-05 15,618 106 262 referral m-pai si 48,008 2014 29.78 2012-10-04 15,617 2014-04-01 16,161 544 302 completion m-pr si 28059_48008 0 1 both yes 1 1 1 0
34,898 6 2013 40.94 2012-11-05 15,649 2014-01-29 16,099 450 328 late dropout pg-pab si 53,984 2014 42.14 2014-01-20 16,090 2014-05-06 16,196 106 502 late dropout pg-pab si 34898_53984 0 1 both yes 0 9 0 0
75,748 7 2015 21.66 2015-03-24 16,518 2015-07-09 16,625 107 181 referral pg-pai si 79,451 2015 21.95 2015-07-06 16,622 2015-09-14 16,692 70 189 completion m-pr si 75748_79451 0 0 both yes 1 3 0 0
60,659 8 2014 39.43 2014-07-11 16,262 2014-09-12 16,325 63 141 early dropout pg-pab si 62,156 2014 39.58 2014-09-01 16,314 2014-11-25 16,399 85 141 early dropout pg-pab si 60659_62156 1 0 both yes 0 11 1 0
43,322 9 2013 27.85 2013-08-26 15,943 2013-12-20 16,059 116 209 referral pg-pab si 52,869 2014 28.16 2013-12-17 16,056 2014-03-01 16,130 74 432 early dropout m-pr si 43322_52869 0 1 both yes 1 3 0 0
48,546 10 2014 31.32 2013-03-11 15,775 2014-04-21 16,181 406 294 late dropout pg-pab si 59,129 2014 32.15 2014-01-06 16,076 2014-11-12 16,386 310 294 late dropout pg-pai si 48546_59129 1 0 both yes 0 105 0 0
51,732 11 2014 42.85 2013-11-04 16,013 2014-03-21 16,150 137 290 referral pg-pai si 66,455 2015 43.15 2014-02-24 16,125 2015-06-12 16,598 473 303 completion pg-pr si 51732_66455 0 1 both yes 1 25 1 0
24,214 12 2012 37.88 2012-01-31 15,370 2012-03-05 15,404 34 163 early dropout pg-pr si 33,099 2013 36.14 2010-05-07 14,736 2013-05-31 15,856 1,120 166 late dropout pg-pab si 24214_33099 0 1 both yes 0 668 1 1
64,530 13 2014 59.30 2014-11-04 16,378 2014-11-05 16,379 1 early adm discharge pg-pr si 86,673 2016 59.28 2014-10-27 16,370 2016-03-29 16,889 519 105 completion pg-pai si 64530_86673 1 both yes 0 9 1 1
75,380 14 2015 23.71 2015-04-02 16,527 2015-08-26 16,673 146 441 referral pg-pai si 89,030 2016 24.09 2015-08-18 16,665 2016-04-01 16,892 227 650 late dropout pg-pr si 75380_89030 0 1 both yes 1 8 1 0
127,231 15 2018 24.89 2017-04-11 17,267 2018-04-24 17,645 378 119 referral pg-pai si 136,733 2018 25.89 2018-04-11 17,632 2018-09-01 17,775 143 117 late dropout pg-pr si 127231_136733 0 0 both yes 1 13 0 0
105,585 16 2017 33.74 2015-02-18 16,484 2017-01-31 17,197 713 288 referral pg-pab si 112,500 2017 35.65 2017-01-16 17,182 2017-08-04 17,382 200 357 completion pg-pr si 105585_112500 0 0 both yes 1 15 0 0
25,237 17 2012 38.88 2012-03-06 15,405 2012-04-12 15,442 37 171 referral pg-pai si 65,490 2015 37.64 2010-12-10 14,953 2015-05-29 16,584 1,631 166 completion pg-pai si 25237_65490 0 1 both yes 1 489 1 1
33,552 17 2013 39.29 2012-08-01 15,553 2013-02-01 15,737 184 171 referral pg-pai si 65,490 2015 37.64 2010-12-10 14,953 2015-05-29 16,584 1,631 166 completion pg-pai si 33552_65490 0 1 both yes 1 784 1 1
25,522 18 2012 21.02 2012-01-27 15,366 2012-03-30 15,429 63 225 referral pg-pai si 25,941 2012 21.18 2012-03-26 15,425 2012-06-30 15,521 96 235 late dropout pg-pr si 25522_25941 0 0 both yes 1 4 1 0
210,165 19 2022 39.30 2021-11-11 18,942 2022-07-20 19,193 251 703 referral pg-pai si 217,904 2022 39.98 2022-07-18 19,191 19,475 162 currently in pg-pr si 210165_217904 0 0 both yes 1 2 0
52,331 20 2014 54.02 2013-12-03 16,042 2014-03-31 16,160 118 365 referral pg-pai si 66,690 2015 54.29 2014-03-14 16,143 2015-02-02 16,468 325 179 completion pg-pr si 52331_66690 0 1 both yes 1 17 1 0
Note.Each row represents an overlap. Variables ending with '_1' are the first case, and variables ending with '_2' correspond to the second case;
a= date; b= numeric
Same Center ID= If both cases share the same Center ID
Financed By SENDA= If both cases are financed by SENDA;
Referral= If the cause of discharge is the referral from another center (1= Referral);
Days Overlapped= Difference between the date of admission of the earlier treatment, and the date of discharge of the latter treatment
2nd treatment has more treatment days= Earlier treatment has more days of treatment

We identified 1,554 overlappings. Some of the users appeared more than once (n= 95); those users may have competing dates of discharge, which will have to be chosen based on their individual trajectories.

0.b.0 Multiple overlappings

We first focused on cases that had multiple overlappings. These will be revised latter.

Code
# c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1",  "rn_2", "hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2")

overlaps_after_miss_appear_more_than_one_time<-
CONS_C1_df_dup_overlaps_COMP |>
    tidytable::pivot_longer(
        cols = matches("_[12]$"),  # All columns ending with _1 or _2
        names_to = c(".value", "wave"),
        names_pattern = "(.+)_([12])",
        values_drop_na = FALSE) |> 
    group_by(rn) |> 
    count() |> 
filter(n>1) |> pull(rn)

multiple_overlaps <-
        CONS_C1_df_dup_overlaps_COMP |> filter(rn_1 %in% overlaps_after_miss_appear_more_than_one_time|rn_2 %in% overlaps_after_miss_appear_more_than_one_time)|>
      (\(df) {
    cat(paste0("00. Multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("00. Multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })()|> 
  tidytable::pivot_longer(
        cols = matches("_[12]$"),  # All columns ending with _1 or _2
        names_to = c(".value", "wave"),
        names_pattern = "(.+)_([12])",
        values_drop_na = FALSE)|> 
  filter(dit<1095, senda!="no")|> 
  group_by(hash_key)|> 
      mutate(
        max_ano_bd = max(ano_bd, na.rm = TRUE),
        max_disch_date_num = max(disch_date_num, na.rm = TRUE)
    )|> 
    # 1. Prioritize completed treatments
  arrange(
    # 2. Then longest duration
    desc(dit),
    # 3. Then most recent retrieval year of the database
    desc(max_ano_bd),
    # 4. Then most recent discharge date
    desc(max_disch_date_num)
  )|> 
  # Keep only the top-ranked row per group
  slice(1)
# 
# 00. Multiple overlappings, cases: 221; june 2025 174 
# 00. Multiple overlappings, RUNs: 87; june 2025 67
invisible("These rules are too simplistic. I did not use them")

CONS_C1_df_dup_overlaps_COMP|> filter(rn_1 %in% overlaps_after_miss_appear_more_than_one_time|rn_2 %in% overlaps_after_miss_appear_more_than_one_time)|>
      (\(df) {
  rio::export(df, "_out/_multiple_overlappings.xlsx") #for visual comparison in excel
        knitr::kable(mutate(df, hash_key= as.numeric(factor(hash_key))), format = "html", format.args = list(decimal.mark = ".", big.mark = ","), caption="Cases with multiple overlappings", align = rep('c', 32)  
               #col.names = c("Row No.(1)", "HASH", "Year of\nDataset(1)", "Admission age(1)", "Admission\ndate(1)(a)", "Admission\ndate(1)(b)", "Discharge\ndate(1)(a)", "Discharge\ndate(1)(b)", "Treatment Days(1)", "Center ID(1)", "Cause of\nDischarge(1)", "Plan Type(1)", "SENDA(1)", "Row No.(2)", "Year of\nDataset(2)", "Admission age(2)", "Admission\ndate(2)(a)", "Admission\ndate(2)(b)", "Discharge\ndate(2)(a)", "Discharge\ndate(2)(b)", "Treatment Days(2)", "Center ID(2)", "Cause of\nDischarge(2)", "Plan Type(2)", "SENDA(2)", "Same Center ID", "Earlier Dataset \nof 2nd Treatment", "Financed \nBy SENDA", "Referral", "Days Overlapped", "2nd treatment has more treatment days", "1st treatment\nabsorbs 2nd")
  ) |>
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"),font_size = 8)|>
  kableExtra::add_footnote( c("Note.Each row represents an overlap. Variables ending with '_1' are the first case, and variables ending with '_2' correspond to the second case;", "a= date; b= numeric", "Same Center ID= If both cases share the same Center ID",  "Financed By SENDA= If both cases are financed by SENDA;", "Referral= If the cause of discharge is the referral from another center (1= Referral);","Days Overlapped= Difference between the date of admission of the earlier treatment, and the date of discharge of the latter treatment","2nd treatment has more treatment days= Earlier treatment has more days of treatment"), notation = "none")|>
  kableExtra::scroll_box(width = "100%", height = "375px")
      })()
00. Multiple overlappings, cases: 174 
00. Multiple overlappings, RUNs: 67 
Cases with multiple overlappings
rn_1 hash_key ano_bd_1 adm_age_1 adm_date_1 adm_date_rec_num_1 disch_date_1 disch_date_num_1 dit_1 id_centro_1 tr_compliance_1 plan_type_1 senda_1 rn_2 ano_bd_2 adm_age_2 adm_date_2 adm_date_rec_num_2 disch_date_2 disch_date_num_2 dit_2 id_centro_2 tr_compliance_2 plan_type_2 senda_2 pair_id same_id bd_2_earlier senda_status referral days_overlapped more_dit trat_1_within_2
71,738 1 2015 26.52 2014-06-30 16,251 2015-07-08 16,624 373 294 referral pg-pab si 78,750 2015 27.52 2015-06-30 16,616 2016-02-01 16,832 216 559 late dropout pg-pai si 71738_78750 0 0 both yes 1 8 0 0
61,433 1 2014 26.65 2014-08-18 16,300 2014-11-25 16,399 99 297 completion pg-pr si 71,738 2015 26.52 2014-06-30 16,251 2015-07-08 16,624 373 294 referral pg-pab si 61433_71738 0 1 both yes 0 148 1 1
97,268 2 2016 25.85 2016-05-24 16,945 2016-09-08 17,052 107 161 late dropout pg-pai si 126,406 2018 26.12 2016-08-30 17,043 2018-08-01 17,744 701 615 completion pg-pab si 97268_126406 0 1 both yes 0 9 1 0
103,121 2 2016 26.27 2016-10-24 17,098 2016-12-01 17,136 38 161 early dropout pg-pai si 126,406 2018 26.12 2016-08-30 17,043 2018-08-01 17,744 701 615 completion pg-pab si 103121_126406 0 1 both yes 0 93 1 1
35,598 3 2013 32.61 2013-01-02 15,707 2013-01-25 15,730 23 161 referral pg-pai si 36,801 2013 32.57 2012-12-20 15,694 2013-05-30 15,855 161 161 late dropout pg-pai si 35598_36801 1 0 both yes 1 36 1 1
35,933 3 2013 32.67 2013-01-25 15,730 2013-01-27 15,732 2 289 early dropout pg-pr si 36,801 2013 32.57 2012-12-20 15,694 2013-05-30 15,855 161 161 late dropout pg-pai si 35933_36801 0 0 both yes 0 38 1 1
210,000 4 2022 33.96 2021-10-18 18,918 19,475 415 currently in pg-pab si 214,345 2022 34.38 2022-03-22 19,073 2022-05-24 19,136 63 248 referral pg-pai si 210000_214345 0 0 both yes 0 402 0
210,000 4 2022 33.96 2021-10-18 18,918 19,475 415 currently in pg-pab si 215,964 2022 34.56 2022-05-25 19,137 19,475 266 currently in pg-pr si 210000_215964 0 0 both yes 0 338 0
5,037 5 2010 31.71 2009-03-04 14,307 2010-05-31 14,760 453 109 referral pg-pai si 6,345 2010 31.73 2009-03-12 14,315 2010-12-31 14,974 659 109 referral pg-pab si 5037_6345 1 0 both yes 1 445 1 0
5,037 5 2010 31.71 2009-03-04 14,307 2010-05-31 14,760 453 109 referral pg-pai si 10,895 2011 31.74 2009-03-15 14,318 2011-03-31 15,064 746 109 referral pg-pai no 5037_10895 1 1 1 442 1 0
5,037 5 2010 31.71 2009-03-04 14,307 2010-05-31 14,760 453 109 referral pg-pai si 16,311 2011 31.71 2009-03-03 14,306 2011-12-01 15,309 1,003 109 referral pg-pab si 5037_16311 1 1 both yes 1 454 1 1
6,345 5 2010 31.73 2009-03-12 14,315 2010-12-31 14,974 659 109 referral pg-pab si 10,895 2011 31.74 2009-03-15 14,318 2011-03-31 15,064 746 109 referral pg-pai no 6345_10895 1 1 1 656 1 0
6,345 5 2010 31.73 2009-03-12 14,315 2010-12-31 14,974 659 109 referral pg-pab si 16,311 2011 31.71 2009-03-03 14,306 2011-12-01 15,309 1,003 109 referral pg-pab si 6345_16311 1 1 both yes 1 668 1 1
10,895 5 2011 31.74 2009-03-15 14,318 2011-03-31 15,064 746 109 referral pg-pai no 16,311 2011 31.71 2009-03-03 14,306 2011-12-01 15,309 1,003 109 referral pg-pab si 10895_16311 1 0 second yes 1 758 1 1
22,218 6 2012 27.55 2011-03-04 15,037 2012-01-20 15,359 322 106 late dropout pg-pab si 48,023 2014 27.56 2011-03-10 15,043 2014-06-30 16,251 1,208 106 completion pg-pab si 22218_48023 1 1 both yes 0 316 1 0
15,745 6 2011 27.56 2011-03-11 15,044 2011-06-02 15,127 83 106 early dropout pg-pab si 22,218 2012 27.55 2011-03-04 15,037 2012-01-20 15,359 322 106 late dropout pg-pab si 15745_22218 1 1 both yes 0 90 1 1
15,745 6 2011 27.56 2011-03-11 15,044 2011-06-02 15,127 83 106 early dropout pg-pab si 48,023 2014 27.56 2011-03-10 15,043 2014-06-30 16,251 1,208 106 completion pg-pab si 15745_48023 1 1 both yes 0 84 1 1
29,888 7 2012 40.20 2012-08-22 15,574 2012-09-28 15,611 37 109 early adm discharge pg-pab si 30,707 2012 40.24 2012-09-07 15,590 2012-12-20 15,694 104 109 referral pg-pai si 29888_30707 1 0 both yes 0 21 1 0
30,707 7 2012 40.24 2012-09-07 15,590 2012-12-20 15,694 104 109 referral pg-pai si 37,147 2013 40.50 2012-12-12 15,686 2013-11-15 16,024 338 117 late dropout pg-pr si 30707_37147 0 1 both yes 1 8 1 0
6,593 8 2010 40.92 2010-07-09 14,799 2019-12-31 18,261 3,462 278 adm truncated pg-pab si 48,343 2014 43.43 2013-01-11 15,716 2014-05-29 16,219 503 329 late dropout pg-pai si 6593_48343 0 1 both yes 0 2,545 0 0
6,593 8 2010 40.92 2010-07-09 14,799 2019-12-31 18,261 3,462 278 adm truncated pg-pab si 67,665 2015 44.90 2014-07-01 16,252 2015-07-14 16,630 378 329 late adm discharge pg-pai si 6593_67665 0 1 both yes 0 2,009 0 0
6,593 8 2010 40.92 2010-07-09 14,799 2019-12-31 18,261 3,462 278 adm truncated pg-pab si 134,162 2018 48.49 2018-02-01 17,563 2018-12-27 17,892 329 601 referral pg-pai si 6593_134162 0 1 both yes 0 698 0 0
70,143 9 2015 28.45 2014-11-04 16,378 2015-07-03 16,619 241 178 referral m-pai si 79,375 2015 29.11 2015-07-01 16,617 2015-10-08 16,716 99 179 late adm discharge pg-pr si 70143_79375 0 0 both yes 1 2 0 0
79,375 9 2015 29.11 2015-07-01 16,617 2015-10-08 16,716 99 179 late adm discharge pg-pr si 90,320 2016 29.37 2015-10-05 16,713 2016-12-01 17,136 423 178 late adm discharge m-pai si 79375_90320 0 1 both yes 0 3 1 0
2,201 10 2010 25.97 2009-11-16 14,564 2010-01-11 14,620 56 297 early dropout pg-pr si 6,996 2010 24.90 2008-10-20 14,172 2010-10-18 14,900 728 161 completion pg-pab no 2201_6996 0 0 0 448 1 1
6,402 10 2010 26.63 2010-07-14 14,804 2010-10-04 14,886 82 161 early adm discharge pg-pai no 6,996 2010 24.90 2008-10-20 14,172 2010-10-18 14,900 728 161 completion pg-pab no 6402_6996 1 0 both no 0 714 1 1
12,461 11 2011 34.55 2010-12-07 14,950 2011-03-30 15,063 113 171 referral pg-pai si 38,473 2013 34.80 2011-03-08 15,041 2013-06-21 15,877 836 166 referral pg-pai si 12461_38473 0 1 both yes 1 22 1 0
38,473 11 2013 34.80 2011-03-08 15,041 2013-06-21 15,877 836 166 referral pg-pai si 41,391 2013 37.08 2013-06-19 15,875 2013-11-20 16,029 154 163 referral pg-pr si 38473_41391 0 0 both yes 1 2 0 0
28,802 11 2012 36.19 2012-07-30 15,551 2012-08-06 15,558 7 171 referral pg-pai si 38,473 2013 34.80 2011-03-08 15,041 2013-06-21 15,877 836 166 referral pg-pai si 28802_38473 0 1 both yes 1 517 1 1
62,746 12 2014 32.45 2014-10-07 16,350 2015-01-26 16,461 111 124 referral pg-pab si 62,885 2014 32.44 2014-10-01 16,344 2014-12-01 16,405 61 124 early dropout pg-pab no 62746_62885 1 0 1 117 0 0
62,746 12 2014 32.45 2014-10-07 16,350 2015-01-26 16,461 111 124 referral pg-pab si 71,596 2015 32.75 2015-01-22 16,457 2015-04-01 16,526 69 434 early dropout pg-pab si 62746_71596 0 1 both yes 1 4 0 0
92,857 13 2016 34.88 2016-01-20 16,820 2016-06-17 16,969 149 408 late adm discharge pg-pai si 95,989 2016 34.91 2016-02-01 16,832 2016-05-04 16,925 93 408 referral pg-pai no 92857_95989 1 0 0 137 0 0
92,857 13 2016 34.88 2016-01-20 16,820 2016-06-17 16,969 149 408 late adm discharge pg-pai si 107,824 2017 35.17 2016-05-05 16,926 2017-04-28 17,284 358 189 completion m-pr si 92857_107824 0 1 both yes 0 43 1 0
78,289 14 2015 35.03 2015-06-18 16,604 2015-07-28 16,644 40 141 referral pg-pai si 80,895 2015 35.02 2015-06-12 16,598 2015-11-26 16,765 167 141 late dropout pg-pai si 78289_80895 1 0 both yes 1 46 1 1
79,294 14 2015 35.14 2015-07-28 16,644 2015-08-10 16,657 13 142 referral m-pr si 80,895 2015 35.02 2015-06-12 16,598 2015-11-26 16,765 167 141 late dropout pg-pai si 79294_80895 0 0 both yes 1 59 1 1
65,616 15 2015 52.83 2012-04-09 15,439 2015-07-23 16,639 1,200 166 referral pg-pab si 88,257 2016 56.11 2015-07-20 16,636 2016-03-02 16,862 226 163 completion m-pr si 65616_88257 0 1 both yes 1 3 0 0
29,337 15 2012 53.16 2012-08-07 15,559 2012-09-03 15,586 27 104 completion m-pr si 65,616 2015 52.83 2012-04-09 15,439 2015-07-23 16,639 1,200 166 referral pg-pab si 29337_65616 0 1 both yes 0 147 1 1
35,046 15 2013 53.41 2012-11-06 15,650 2013-04-04 15,799 149 171 completion pg-pai si 65,616 2015 52.83 2012-04-09 15,439 2015-07-23 16,639 1,200 166 referral pg-pab si 35046_65616 0 1 both yes 0 360 1 1
6,791 16 2010 27.28 2009-10-23 14,540 2010-12-31 14,974 434 122 completion pg-pab si 12,841 2011 27.26 2009-10-15 14,532 2011-05-31 15,125 593 122 completion m-pai si 6791_12841 1 1 both yes 0 442 1 1
6,105 16 2010 27.93 2010-06-17 14,777 2010-07-01 14,791 14 122 early adm discharge pg-pab no 6,791 2010 27.28 2009-10-23 14,540 2010-12-31 14,974 434 122 completion pg-pab si 6105_6791 1 0 second yes 0 251 1 1
6,105 16 2010 27.93 2010-06-17 14,777 2010-07-01 14,791 14 122 early adm discharge pg-pab no 12,841 2011 27.26 2009-10-15 14,532 2011-05-31 15,125 593 122 completion m-pai si 6105_12841 1 1 second yes 0 259 1 1
12,180 17 2011 52.72 2008-12-10 14,223 2012-01-31 15,370 1,147 109 referral pg-pai si 25,660 2012 53.02 2009-03-31 14,334 2013-01-31 15,736 1,402 109 late dropout pg-pai si 12180_25660 1 1 both yes 1 1,036 1 0
12,180 17 2011 52.72 2008-12-10 14,223 2012-01-31 15,370 1,147 109 referral pg-pai si 48,425 2014 53.94 2010-03-01 14,669 2014-09-30 16,343 1,674 109 completion pg-pab si 12180_48425 1 1 both yes 1 701 1 0
8,809 17 2010 52.72 2008-12-12 14,225 2010-10-31 14,913 688 109 referral pg-pab si 12,180 2011 52.72 2008-12-10 14,223 2012-01-31 15,370 1,147 109 referral pg-pai si 8809_12180 1 1 both yes 1 690 1 1
8,809 17 2010 52.72 2008-12-12 14,225 2010-10-31 14,913 688 109 referral pg-pab si 25,660 2012 53.02 2009-03-31 14,334 2013-01-31 15,736 1,402 109 late dropout pg-pai si 8809_25660 1 1 both yes 1 579 1 0
8,809 17 2010 52.72 2008-12-12 14,225 2010-10-31 14,913 688 109 referral pg-pab si 48,425 2014 53.94 2010-03-01 14,669 2014-09-30 16,343 1,674 109 completion pg-pab si 8809_48425 1 1 both yes 1 244 1 0
25,660 17 2012 53.02 2009-03-31 14,334 2013-01-31 15,736 1,402 109 late dropout pg-pai si 48,425 2014 53.94 2010-03-01 14,669 2014-09-30 16,343 1,674 109 completion pg-pab si 25660_48425 1 1 both yes 0 1,067 1 0
20,109 18 2011 59.00 2011-11-15 15,293 2015-01-02 16,437 1,144 236 late adm discharge pg-pr no 62,620 2014 61.88 2014-10-01 16,344 2014-11-26 16,400 56 238 referral pg-pab no 20109_62620 0 1 both no 0 93 0 0
20,109 18 2011 59.00 2011-11-15 15,293 2015-01-02 16,437 1,144 236 late adm discharge pg-pr no 72,297 2015 62.01 2014-11-18 16,392 2015-08-10 16,657 265 258 completion pg-pr si 20109_72297 0 1 second yes 0 45 0 0
62,620 18 2014 61.88 2014-10-01 16,344 2014-11-26 16,400 56 238 referral pg-pab no 72,297 2015 62.01 2014-11-18 16,392 2015-08-10 16,657 265 258 completion pg-pr si 62620_72297 0 1 second yes 1 8 1 0
29,467 19 2012 35.58 2009-09-15 14,502 2012-11-05 15,649 1,147 123 referral pg-pab si 34,820 2013 38.65 2012-10-10 15,623 2013-10-21 15,999 376 117 late dropout pg-pr si 29467_34820 0 1 both yes 1 26 0 0
21,597 19 2012 37.37 2011-06-30 15,155 2012-07-25 15,546 391 123 late dropout pg-pab si 29,467 2012 35.58 2009-09-15 14,502 2012-11-05 15,649 1,147 123 referral pg-pab si 21597_29467 1 0 both yes 0 1,044 1 1
45,610 20 2013 38.05 2013-10-04 15,982 2013-12-18 16,057 75 261 referral pg-pai si 52,801 2014 38.24 2013-12-12 16,051 2014-10-01 16,344 293 258 completion m-pr si 45610_52801 0 1 both yes 1 6 1 0
52,801 20 2014 38.24 2013-12-12 16,051 2014-10-01 16,344 293 258 completion m-pr si 70,544 2015 39.04 2014-09-30 16,343 2015-05-27 16,582 239 261 late dropout pg-pai si 52801_70544 0 1 both yes 0 1 0 0
336 21 2010 21.88 2007-07-17 13,711 2010-02-16 14,656 945 118 referral pg-pai si 3,914 2010 23.75 2009-05-29 14,393 2011-01-05 14,979 586 118 referral pg-pab si 336_3914 1 0 both yes 1 263 0 0
336 21 2010 21.88 2007-07-17 13,711 2010-02-16 14,656 945 118 referral pg-pai si 12,792 2011 24.35 2010-01-04 14,613 2012-02-01 15,371 758 118 late adm discharge pg-pai si 336_12792 1 1 both yes 1 43 0 0
3,914 21 2010 23.75 2009-05-29 14,393 2011-01-05 14,979 586 118 referral pg-pab si 12,792 2011 24.35 2010-01-04 14,613 2012-02-01 15,371 758 118 late adm discharge pg-pai si 3914_12792 1 1 both yes 1 366 1 0
23,666 22 2012 44.38 2012-01-13 15,352 2012-06-14 15,505 153 142 completion pg-pr si 38,062 2013 44.04 2011-09-12 15,229 2013-05-27 15,852 623 291 completion pg-pai si 23666_38062 0 1 both yes 0 276 1 1
27,925 22 2012 44.80 2012-06-15 15,506 2012-07-06 15,527 21 142 referral m-pr si 38,062 2013 44.04 2011-09-12 15,229 2013-05-27 15,852 623 291 completion pg-pai si 27925_38062 0 1 both yes 1 298 1 1
28,306 22 2012 44.87 2012-07-09 15,530 2012-10-30 15,643 113 160 late adm discharge m-pai si 38,062 2013 44.04 2011-09-12 15,229 2013-05-27 15,852 623 291 completion pg-pai si 28306_38062 0 1 both yes 0 414 1 1
31,548 22 2012 45.23 2012-11-19 15,663 2013-03-11 15,775 112 291 late dropout pg-pai si 38,062 2013 44.04 2011-09-12 15,229 2013-05-27 15,852 623 291 completion pg-pai si 31548_38062 1 1 both yes 0 546 1 1
12,972 23 2011 21.38 2011-01-24 14,998 2011-06-15 15,140 142 238 late dropout pg-pab si 21,856 2012 21.73 2011-06-02 15,127 2013-02-01 15,737 610 246 completion pg-pai si 12972_21856 0 1 both yes 0 13 1 0
21,856 23 2012 21.73 2011-06-02 15,127 2013-02-01 15,737 610 246 completion pg-pai si 29,812 2012 23.04 2012-09-24 15,607 2012-10-05 15,618 11 148 early dropout pg-pai no 21856_29812 0 0 0 130 0 0
68,152 24 2015 55.21 2014-07-08 16,259 2015-01-29 16,464 205 428 referral pg-pab si 71,405 2015 55.72 2015-01-07 16,442 2015-07-30 16,646 204 243 referral m-pr si 68152_71405 0 0 both yes 1 22 0 0
71,405 24 2015 55.72 2015-01-07 16,442 2015-07-30 16,646 204 243 referral m-pr si 80,579 2015 56.19 2015-06-30 16,616 2015-11-24 16,763 147 428 referral pg-pab si 71405_80579 0 0 both yes 1 30 0 0
80,579 24 2015 56.19 2015-06-30 16,616 2015-11-24 16,763 147 428 referral pg-pab si 90,940 2016 56.56 2015-11-10 16,749 2016-03-11 16,871 122 260 late dropout pg-pai si 80579_90940 0 1 both yes 1 14 0 0
3,961 25 2010 37.41 2010-03-22 14,690 2010-04-15 14,714 24 123 referral pg-pai si 30,636 2012 37.18 2009-12-28 14,606 2012-11-13 15,657 1,051 123 referral pg-pai si 3961_30636 1 1 both yes 1 108 1 1
5,127 25 2010 37.48 2010-04-16 14,715 2010-07-18 14,808 93 117 late dropout pg-pr si 30,636 2012 37.18 2009-12-28 14,606 2012-11-13 15,657 1,051 123 referral pg-pai si 5127_30636 0 1 both yes 0 202 1 1
14,074 26 2011 23.27 2008-11-25 14,208 2011-02-28 15,033 825 109 referral pg-pai si 23,472 2012 23.35 2008-12-25 14,238 2012-01-31 15,370 1,132 109 referral pg-pab si 14074_23472 1 1 both yes 1 795 1 0
14,074 26 2011 23.27 2008-11-25 14,208 2011-02-28 15,033 825 109 referral pg-pai si 25,150 2012 23.53 2009-03-02 14,305 2012-03-21 15,420 1,115 109 referral pg-pai si 14074_25150 1 1 both yes 1 728 1 0
23,472 26 2012 23.35 2008-12-25 14,238 2012-01-31 15,370 1,132 109 referral pg-pab si 25,150 2012 23.53 2009-03-02 14,305 2012-03-21 15,420 1,115 109 referral pg-pai si 23472_25150 1 0 both yes 1 1,065 0 0
14,882 26 2011 25.55 2011-03-07 15,040 2011-06-30 15,155 115 109 referral pg-pab si 23,472 2012 23.35 2008-12-25 14,238 2012-01-31 15,370 1,132 109 referral pg-pab si 14882_23472 1 1 both yes 1 917 1 1
14,882 26 2011 25.55 2011-03-07 15,040 2011-06-30 15,155 115 109 referral pg-pab si 25,150 2012 23.53 2009-03-02 14,305 2012-03-21 15,420 1,115 109 referral pg-pai si 14882_25150 1 1 both yes 1 850 1 1
17,443 26 2011 25.86 2011-07-01 15,156 2011-08-01 15,187 31 109 referral pg-pai si 23,472 2012 23.35 2008-12-25 14,238 2012-01-31 15,370 1,132 109 referral pg-pab si 17443_23472 1 1 both yes 1 949 1 1
17,443 26 2011 25.86 2011-07-01 15,156 2011-08-01 15,187 31 109 referral pg-pai si 25,150 2012 23.53 2009-03-02 14,305 2012-03-21 15,420 1,115 109 referral pg-pai si 17443_25150 1 1 both yes 1 882 1 1
38,847 27 2013 22.41 2011-10-19 15,266 2013-05-29 15,854 588 109 referral pg-pai si 40,104 2013 23.96 2013-05-07 15,832 2013-08-22 15,939 107 117 late adm discharge pg-pr no 38847_40104 0 0 1 22 0 0
26,405 27 2012 22.91 2012-04-18 15,448 2012-08-27 15,579 131 117 late adm discharge pg-pr si 38,847 2013 22.41 2011-10-19 15,266 2013-05-29 15,854 588 109 referral pg-pai si 26405_38847 0 1 both yes 0 313 1 1
30,714 27 2012 23.36 2012-09-28 15,611 2013-03-29 15,793 182 109 referral pg-pab si 38,847 2013 22.41 2011-10-19 15,266 2013-05-29 15,854 588 109 referral pg-pai si 30714_38847 1 1 both yes 1 527 1 1
163,071 28 2019 29.80 2019-02-25 17,952 2019-07-31 18,108 156 408 referral pg-pai si 173,888 2020 29.87 2019-03-21 17,976 2020-07-31 18,474 498 342 referral m-pai si 163071_173888 0 1 both yes 1 132 1 0
173,888 28 2020 29.87 2019-03-21 17,976 2020-07-31 18,474 498 342 referral m-pai si 190,835 2021 31.03 2020-05-19 18,401 2021-02-25 18,683 282 408 completion pg-pai si 173888_190835 0 1 both yes 1 73 0 0
13,414 29 2011 34.92 2009-01-27 14,271 2011-01-31 15,005 734 109 referral pg-pab si 27,700 2012 35.71 2009-11-11 14,559 2012-10-31 15,644 1,085 109 referral pg-pab si 13414_27700 1 1 both yes 1 446 1 0
21,030 29 2012 36.94 2011-02-01 15,006 2012-05-31 15,491 485 109 referral pg-pai si 27,700 2012 35.71 2009-11-11 14,559 2012-10-31 15,644 1,085 109 referral pg-pab si 21030_27700 1 0 both yes 1 932 1 1
43,675 30 2013 31.79 2013-08-07 15,924 2013-11-15 16,024 100 433 late dropout pg-pab si 66,054 2015 32.05 2013-11-11 16,020 2015-03-03 16,497 477 438 referral m-pai si 43675_66054 0 1 both yes 0 4 1 0
66,054 30 2015 32.05 2013-11-11 16,020 2015-03-03 16,497 477 438 referral m-pai si 73,791 2015 33.36 2015-03-02 16,496 2015-03-24 16,518 22 159 early dropout m-pr si 66054_73791 0 0 both yes 1 1 0 0
16,905 31 2011 29.22 2008-07-08 14,068 2012-01-27 15,366 1,298 123 referral pg-pab si 28,504 2012 30.29 2009-08-06 14,462 2012-09-28 15,611 1,149 123 late dropout pg-pab si 16905_28504 1 1 both yes 1 904 0 0
8,967 31 2010 31.59 2010-11-24 14,937 2010-12-20 14,963 26 123 referral pg-pai si 16,905 2011 29.22 2008-07-08 14,068 2012-01-27 15,366 1,298 123 referral pg-pab si 8967_16905 1 1 both yes 1 895 1 1
8,967 31 2010 31.59 2010-11-24 14,937 2010-12-20 14,963 26 123 referral pg-pai si 28,504 2012 30.29 2009-08-06 14,462 2012-09-28 15,611 1,149 123 late dropout pg-pab si 8967_28504 1 1 both yes 1 501 1 1
13,338 31 2011 31.67 2010-12-20 14,963 2011-01-24 14,998 35 117 early adm discharge pg-pr si 16,905 2011 29.22 2008-07-08 14,068 2012-01-27 15,366 1,298 123 referral pg-pab si 13338_16905 0 0 both yes 0 930 1 1
13,338 31 2011 31.67 2010-12-20 14,963 2011-01-24 14,998 35 117 early adm discharge pg-pr si 28,504 2012 30.29 2009-08-06 14,462 2012-09-28 15,611 1,149 123 late dropout pg-pab si 13338_28504 0 1 both yes 0 536 1 1
35,639 32 2013 22.90 2013-01-18 15,723 2013-12-30 16,069 346 225 completion pg-pai si 38,713 2013 23.15 2013-04-22 15,817 2013-09-27 15,975 158 255 late dropout pg-pab si 35639_38713 0 0 both yes 0 252 0 0
35,639 32 2013 22.90 2013-01-18 15,723 2013-12-30 16,069 346 225 completion pg-pai si 51,795 2014 23.66 2013-10-23 16,001 2014-01-30 16,100 99 255 late dropout pg-pab si 35639_51795 0 1 both yes 0 68 0 0
53,516 33 2014 25.98 2014-01-07 16,077 2014-04-01 16,161 84 146 early dropout pg-pab si 87,420 2016 24.82 2012-11-09 15,653 2016-05-18 16,939 1,286 146 referral pg-pai si 53516_87420 1 1 both yes 0 508 1 1
73,632 33 2015 27.01 2015-01-16 16,451 2015-04-30 16,555 104 146 referral pg-pr no 87,420 2016 24.82 2012-11-09 15,653 2016-05-18 16,939 1,286 146 referral pg-pai si 73632_87420 1 1 second yes 1 902 1 1
54,900 34 2014 20.16 2014-02-03 16,104 2014-04-29 16,189 85 212 referral pg-pai si 57,107 2014 20.34 2014-04-10 16,170 2014-06-24 16,245 75 354 early dropout pg-pr si 54900_57107 0 0 both yes 1 19 0 0
57,107 34 2014 20.34 2014-04-10 16,170 2014-06-24 16,245 75 354 early dropout pg-pr si 58,996 2014 20.53 2014-06-19 16,240 2014-07-10 16,261 21 212 early dropout pg-pai si 57107_58996 0 0 both yes 0 5 0 0
3,282 35 2010 37.56 2009-10-05 14,522 2010-01-04 14,613 91 109 late dropout pg-pai si 3,871 2010 37.14 2009-05-05 14,369 2011-06-09 15,134 765 109 late dropout pg-pab si 3282_3871 1 0 both yes 0 244 1 1
2,632 35 2010 37.58 2009-10-12 14,529 2010-01-04 14,613 84 109 early dropout pg-pab si 3,282 2010 37.56 2009-10-05 14,522 2010-01-04 14,613 91 109 late dropout pg-pai si 2632_3282 1 0 both yes 0 91 1 0
2,632 35 2010 37.58 2009-10-12 14,529 2010-01-04 14,613 84 109 early dropout pg-pab si 3,871 2010 37.14 2009-05-05 14,369 2011-06-09 15,134 765 109 late dropout pg-pab si 2632_3871 1 0 both yes 0 244 1 1
70,771 36 2015 21.16 2014-12-05 16,409 2015-05-01 16,556 147 202 referral pg-pai si 87,269 2016 21.56 2015-04-29 16,554 2016-05-29 16,950 396 215 completion pg-pr si 70771_87269 0 1 both yes 1 2 1 0
87,269 36 2016 21.56 2015-04-29 16,554 2016-05-29 16,950 396 215 completion pg-pr si 90,126 2016 22.04 2015-10-21 16,729 2016-11-28 17,133 404 215 completion pg-pr si 87269_90126 1 0 both yes 0 221 1 0
34,867 37 2013 35.76 2012-11-01 15,645 2013-12-23 16,062 417 179 completion pg-pr si 54,152 2014 36.89 2013-12-16 16,055 2014-06-02 16,223 168 365 referral pg-pai si 34867_54152 0 1 both yes 0 7 0 0
54,152 37 2014 36.89 2013-12-16 16,055 2014-06-02 16,223 168 365 referral pg-pai si 58,961 2014 37.34 2014-06-01 16,222 2014-09-01 16,314 92 179 late dropout pg-pr si 54152_58961 0 0 both yes 1 1 0 0
2,678 38 2010 45.60 2009-09-29 14,516 2010-04-30 14,729 213 109 referral pg-pab si 5,505 2010 45.67 2009-10-26 14,543 2010-05-31 14,760 217 109 referral pg-pai si 2678_5505 1 0 both yes 1 186 1 0
2,678 38 2010 45.60 2009-09-29 14,516 2010-04-30 14,729 213 109 referral pg-pab si 10,716 2011 44.93 2009-01-29 14,273 2011-10-31 15,278 1,005 109 referral pg-pab si 2678_10716 1 1 both yes 1 456 1 1
5,505 38 2010 45.67 2009-10-26 14,543 2010-05-31 14,760 217 109 referral pg-pai si 10,716 2011 44.93 2009-01-29 14,273 2011-10-31 15,278 1,005 109 referral pg-pab si 5505_10716 1 1 both yes 1 487 1 1
5,359 39 2010 40.66 2010-05-03 14,732 2010-09-30 14,882 150 251 referral pg-pab si 11,825 2011 41.07 2010-09-29 14,881 2011-01-21 14,995 114 260 referral pg-pai si 5359_11825 0 1 both yes 1 1 0 0
11,825 39 2011 41.07 2010-09-29 14,881 2011-01-21 14,995 114 260 referral pg-pai si 13,130 2011 41.36 2011-01-14 14,988 2011-09-18 15,235 247 234 completion m-pr si 11825_13130 0 0 both yes 1 7 1 0
68,328 40 2015 26.25 2014-08-18 16,300 2015-08-03 16,650 350 430 late dropout pg-pai si 75,755 2015 26.93 2015-04-27 16,552 2015-06-16 16,602 50 488 early dropout m-pr no 68328_75755 0 0 0 98 0 0
68,328 40 2015 26.25 2014-08-18 16,300 2015-08-03 16,650 350 430 late dropout pg-pai si 80,472 2015 27.20 2015-08-01 16,648 2015-10-07 16,715 67 161 referral m-pai si 68328_80472 0 0 both yes 0 2 0 0
36,873 41 2013 27.84 2013-01-21 15,726 2013-06-24 15,880 154 155 late dropout pg-pab si 40,999 2013 28.23 2013-06-12 15,868 2013-06-14 15,870 2 147 early dropout pg-pr no 36873_40999 0 0 0 12 0 0
36,873 41 2013 27.84 2013-01-21 15,726 2013-06-24 15,880 154 155 late dropout pg-pab si 42,419 2013 28.21 2013-06-06 15,862 2013-11-29 16,038 176 254 late adm discharge pg-pai si 36873_42419 0 0 both yes 0 18 1 0
40,999 41 2013 28.23 2013-06-12 15,868 2013-06-14 15,870 2 147 early dropout pg-pr no 42,419 2013 28.21 2013-06-06 15,862 2013-11-29 16,038 176 254 late adm discharge pg-pai si 40999_42419 0 0 second yes 0 8 1 1
17,363 42 2011 33.04 2011-07-14 15,169 2011-09-13 15,230 61 262 referral m-pai si 18,983 2011 33.20 2011-09-12 15,229 2011-11-17 15,295 66 246 referral pg-pai si 17363_18983 0 0 both yes 1 1 1 0
18,983 42 2011 33.20 2011-09-12 15,229 2011-11-17 15,295 66 246 referral pg-pai si 20,194 2011 33.38 2011-11-15 15,293 2011-11-28 15,306 13 243 early adm discharge m-pr si 18983_20194 0 0 both yes 1 2 0 0
11,071 43 2011 19.82 2010-08-20 14,841 2011-01-24 14,998 157 195 late adm discharge pg-pab si 15,672 2011 20.22 2011-01-12 14,986 2019-12-31 18,261 3,275 269 adm truncated pg-pr no 11071_15672 0 0 0 12 1 0
15,672 43 2011 20.22 2011-01-12 14,986 2019-12-31 18,261 3,275 269 adm truncated pg-pr no 20,199 2011 20.70 2011-07-05 15,160 2019-12-31 18,261 3,101 269 adm truncated pg-pr no 15672_20199 1 0 both no 0 3,101 0 0
209,491 44 2022 40.20 2021-10-01 18,901 2022-07-29 19,202 301 168 referral m-pai si 216,140 2022 40.86 2022-05-30 19,142 2022-06-20 19,163 21 795 referral m-pr no 209491_216140 0 0 1 60 0 0
209,491 44 2022 40.20 2021-10-01 18,901 2022-07-29 19,202 301 168 referral m-pai si 216,999 2022 40.94 2022-06-29 19,172 2022-08-24 19,228 56 795 early dropout m-pai si 209491_216999 0 0 both yes 1 30 0 0
9,504 45 2010 28.01 2010-12-01 14,944 2010-12-31 14,974 30 185 referral pg-pab si 12,774 2011 28.04 2010-12-13 14,956 2011-10-20 15,267 311 185 referral pg-pai si 9504_12774 1 1 both yes 1 18 1 0
12,774 45 2011 28.04 2010-12-13 14,956 2011-10-20 15,267 311 185 referral pg-pai si 22,564 2012 28.89 2011-10-19 15,266 2012-03-30 15,429 163 197 late adm discharge m-pr no 12774_22564 0 1 1 1 0 0
43,099 46 2013 40.11 2013-08-02 15,919 2013-11-14 16,023 104 469 referral pg-pab si 51,830 2014 40.33 2013-10-22 16,000 2014-08-06 16,288 288 137 completion m-pai si 43099_51830 0 1 both yes 1 23 1 0
51,830 46 2014 40.33 2013-10-22 16,000 2014-08-06 16,288 288 137 completion m-pai si 68,070 2015 41.11 2014-08-04 16,286 2015-10-21 16,729 443 561 completion pg-pai si 51830_68070 0 1 both yes 0 2 1 0
2,429 47 2010 40.19 2010-01-25 14,634 2010-04-16 14,715 81 219 early dropout m-pr si 22,971 2012 39.03 2008-11-28 14,211 2012-08-30 15,582 1,371 123 referral pg-pab si 2429_22971 0 1 both yes 0 504 1 1
10,739 47 2011 40.62 2010-07-01 14,791 2011-04-19 15,083 292 205 late adm discharge pg-pai si 22,971 2012 39.03 2008-11-28 14,211 2012-08-30 15,582 1,371 123 referral pg-pab si 10739_22971 0 1 both yes 0 872 1 1
60,019 47 2014 44.61 2014-06-26 16,247 2014-10-30 16,373 126 353 referral pg-pai si 69,890 2015 44.94 2014-10-28 16,371 2015-01-12 16,447 76 345 referral m-pr si 60019_69890 0 1 both yes 1 2 0 0
69,890 47 2015 44.94 2014-10-28 16,371 2015-01-12 16,447 76 345 referral m-pr si 72,004 2015 45.13 2015-01-02 16,437 2015-06-19 16,605 168 347 referral pg-pai si 69890_72004 0 0 both yes 1 10 1 0
208,911 48 2022 54.74 2021-09-14 18,884 2022-02-08 19,031 147 750 referral pg-pab si 212,965 2022 55.05 2022-01-05 18,997 2022-11-02 19,298 301 272 referral pg-pai si 208911_212965 0 0 both yes 1 34 1 0
212,833 48 2022 55.15 2022-02-09 19,032 2022-02-21 19,044 12 272 early dropout pg-pai si 212,965 2022 55.05 2022-01-05 18,997 2022-11-02 19,298 301 272 referral pg-pai si 212833_212965 1 0 both yes 0 47 1 1
27,819 49 2012 35.06 2012-06-20 15,511 2012-07-30 15,551 40 204 referral pg-pab si 28,664 2012 35.12 2012-07-12 15,533 2012-08-16 15,568 35 205 referral pg-pai si 27819_28664 0 0 both yes 1 18 0 0
28,664 49 2012 35.12 2012-07-12 15,533 2012-08-16 15,568 35 205 referral pg-pai si 29,097 2012 35.19 2012-08-07 15,559 2012-08-16 15,568 9 215 early dropout pg-pr si 28664_29097 0 0 both yes 1 9 0 0
1,281 50 2010 33.64 2009-12-23 14,601 2010-05-03 14,732 131 249 late dropout pg-pab si 10,331 2011 33.90 2010-03-29 14,697 2011-03-02 15,035 338 300 completion m-pr no 1281_10331 0 1 0 35 1 0
10,331 50 2011 33.90 2010-03-29 14,697 2011-03-02 15,035 338 300 completion m-pr no 13,983 2011 34.81 2011-02-24 15,029 2011-06-07 15,132 103 262 late dropout m-pab si 10331_13983 0 0 second yes 0 6 0 0
50,065 51 2014 37.88 2013-07-31 15,917 2014-03-21 16,150 233 125 referral pg-pai si 55,877 2014 38.47 2014-03-05 16,134 2014-06-16 16,237 103 125 referral pg-pab si 50065_55877 1 0 both yes 1 16 0 0
55,877 51 2014 38.47 2014-03-05 16,134 2014-06-16 16,237 103 125 referral pg-pab si 85,681 2016 38.68 2014-05-19 16,209 2016-05-16 16,937 728 353 completion pg-pai no 55877_85681 0 1 1 28 1 0
25,237 52 2012 38.88 2012-03-06 15,405 2012-04-12 15,442 37 171 referral pg-pai si 65,490 2015 37.64 2010-12-10 14,953 2015-05-29 16,584 1,631 166 completion pg-pai si 25237_65490 0 1 both yes 1 489 1 1
33,552 52 2013 39.29 2012-08-01 15,553 2013-02-01 15,737 184 171 referral pg-pai si 65,490 2015 37.64 2010-12-10 14,953 2015-05-29 16,584 1,631 166 completion pg-pai si 33552_65490 0 1 both yes 1 784 1 1
24,667 53 2012 31.20 2012-02-15 15,385 2012-05-18 15,478 93 161 referral m-pai si 27,109 2012 31.45 2012-05-17 15,477 2012-10-03 15,616 139 275 late dropout m-pr si 24667_27109 0 0 both yes 1 1 1 0
27,109 53 2012 31.45 2012-05-17 15,477 2012-10-03 15,616 139 275 late dropout m-pr si 48,001 2014 31.83 2012-10-02 15,615 2015-02-16 16,482 867 161 late dropout m-pai si 27109_48001 0 1 both yes 0 1 1 0
58,442 54 2014 34.85 2014-06-04 16,225 2014-09-25 16,338 113 225 referral pg-pab si 62,112 2014 35.15 2014-09-22 16,335 2014-10-30 16,373 38 235 early dropout pg-pr si 58442_62112 0 0 both yes 1 3 0 0
62,112 54 2014 35.15 2014-09-22 16,335 2014-10-30 16,373 38 235 early dropout pg-pr si 63,636 2014 35.22 2014-10-16 16,359 2015-03-16 16,510 151 225 late dropout pg-pab si 62112_63636 0 0 both yes 0 14 1 0
194 55 2010 23.63 2009-10-08 14,525 19,475 275 early dropout m-pr si 4,670 2010 24.19 2010-04-29 14,728 2010-05-03 14,732 4 159 early dropout m-pr si 194_4670 0 0 both yes 0 4,747 0
194 55 2010 23.63 2009-10-08 14,525 19,475 275 early dropout m-pr si 24,666 2012 25.97 2012-02-10 15,380 2012-06-01 15,492 112 161 referral m-pai si 194_24666 0 1 both yes 0 4,095 0
194 55 2010 23.63 2009-10-08 14,525 19,475 275 early dropout m-pr si 200,171 2021 35.39 2021-07-13 18,821 2021-09-01 18,871 50 143 early dropout pg-pai si 194_200171 0 1 both yes 0 654 0
35,516 56 2013 34.71 2013-01-14 15,719 2013-04-29 15,824 105 249 referral pg-pab si 37,922 2013 34.83 2013-02-25 15,761 2013-05-01 15,826 65 238 referral pg-pai no 35516_37922 0 0 1 63 0 0
35,516 56 2013 34.71 2013-01-14 15,719 2013-04-29 15,824 105 249 referral pg-pab si 39,524 2013 34.98 2013-04-22 15,817 2013-06-17 15,873 56 285 early dropout pg-pr si 35516_39524 0 0 both yes 1 7 0 0
37,922 56 2013 34.83 2013-02-25 15,761 2013-05-01 15,826 65 238 referral pg-pai no 39,524 2013 34.98 2013-04-22 15,817 2013-06-17 15,873 56 285 early dropout pg-pr si 37922_39524 0 0 second yes 1 9 0 0
2,603 57 2010 23.74 2010-01-20 14,629 19,475 117 early dropout pg-pr si 24,854 2012 25.81 2012-02-16 15,386 2012-03-07 15,406 20 120 referral pg-pai si 2603_24854 0 1 both yes 0 4,089 0
2,603 57 2010 23.74 2010-01-20 14,629 19,475 117 early dropout pg-pr si 25,221 2012 25.87 2012-03-08 15,407 2012-05-08 15,468 61 362 early dropout m-pr si 2603_25221 0 1 both yes 0 4,068 0
42,920 58 2013 39.48 2012-08-23 15,575 2013-11-01 16,010 435 248 referral pg-pai si 51,617 2014 40.65 2013-10-24 16,002 2014-04-20 16,180 178 271 late adm discharge pg-pr si 42920_51617 0 1 both yes 1 8 0 0
36,594 58 2013 39.84 2013-01-02 15,707 2013-08-01 15,918 211 248 late adm discharge pg-pai no 42,920 2013 39.48 2012-08-23 15,575 2013-11-01 16,010 435 248 referral pg-pai si 36594_42920 1 0 second yes 0 343 1 1
1,752 59 2010 27.99 2009-08-17 14,473 2010-11-30 14,943 470 287 late dropout pg-pab si 29,412 2012 28.03 2009-08-30 14,486 2013-03-12 15,776 1,290 287 late dropout pg-pab si 1752_29412 1 1 both yes 0 457 1 0
20,899 59 2012 29.37 2011-01-03 14,977 2012-05-02 15,462 485 287 late dropout pg-pab si 29,412 2012 28.03 2009-08-30 14,486 2013-03-12 15,776 1,290 287 late dropout pg-pab si 20899_29412 1 0 both yes 0 976 1 1
26,804 60 2012 33.75 2012-05-24 15,484 2012-11-27 15,671 187 146 referral pg-pab si 31,405 2012 34.17 2012-10-25 15,638 2012-12-17 15,691 53 142 completion pg-pr si 26804_31405 0 0 both yes 1 33 0 0
26,804 60 2012 33.75 2012-05-24 15,484 2012-11-27 15,671 187 146 referral pg-pab si 48,099 2014 34.26 2012-11-25 15,669 2014-03-31 16,160 491 142 completion m-pr si 26804_48099 0 1 both yes 1 2 1 0
31,405 60 2012 34.17 2012-10-25 15,638 2012-12-17 15,691 53 142 completion pg-pr si 48,099 2014 34.26 2012-11-25 15,669 2014-03-31 16,160 491 142 completion m-pr si 31405_48099 1 1 both yes 0 22 1 0
25,083 61 2012 34.80 2012-02-21 15,391 2012-06-04 15,495 104 251 referral pg-pab si 27,357 2012 35.05 2012-05-23 15,483 2012-11-19 15,663 180 260 referral pg-pai si 25083_27357 0 0 both yes 1 12 1 0
27,357 61 2012 35.05 2012-05-23 15,483 2012-11-19 15,663 180 260 referral pg-pai si 34,809 2013 35.53 2012-11-14 15,658 2013-05-31 15,856 198 243 late dropout m-pr si 27357_34809 0 1 both yes 1 5 1 0
73,881 62 2015 28.49 2015-03-25 16,519 2015-05-01 16,556 37 260 referral pg-pai si 76,076 2015 28.58 2015-04-27 16,552 2015-06-02 16,588 36 358 early adm discharge pg-pr si 73881_76076 0 0 both yes 1 4 0 0
76,076 62 2015 28.58 2015-04-27 16,552 2015-06-02 16,588 36 358 early adm discharge pg-pr si 77,408 2015 28.66 2015-05-27 16,582 2015-08-12 16,659 77 260 referral pg-pai si 76076_77408 0 0 both yes 0 6 1 0
8,239 63 2010 27.79 2010-10-07 14,889 19,475 275 early dropout m-pr si 32,167 2013 28.30 2011-04-12 15,076 2013-12-18 16,057 981 224 late adm discharge pg-pai si 8239_32167 0 1 both yes 0 4,399 0
8,239 63 2010 27.79 2010-10-07 14,889 19,475 275 early dropout m-pr si 97,354 2016 33.42 2016-05-23 16,944 2016-11-01 17,106 162 277 referral m-pr si 8239_97354 0 1 both yes 0 2,531 0
11,442 64 2011 28.84 2007-10-14 13,800 2011-01-31 15,005 1,205 136 late adm discharge pg-pai no 13,156 2011 31.80 2010-10-01 14,883 2011-08-27 15,213 330 136 late adm discharge pg-pai si 11442_13156 1 0 second yes 0 122 0 0
753 64 2010 30.00 2008-12-11 14,224 2010-02-16 14,656 432 136 referral pg-pab si 11,442 2011 28.84 2007-10-14 13,800 2011-01-31 15,005 1,205 136 late adm discharge pg-pai no 753_11442 1 1 1 856 1 1
8,764 65 2010 40.96 2007-11-09 13,826 2011-07-28 15,183 1,357 109 completion pg-pab si 37,235 2013 43.27 2010-03-01 14,669 2013-02-28 15,764 1,095 109 referral pg-pab si 8764_37235 1 1 both yes 0 514 0 0
8,764 65 2010 40.96 2007-11-09 13,826 2011-07-28 15,183 1,357 109 completion pg-pab si 85,373 2016 43.35 2010-03-31 14,699 2016-07-01 16,983 2,284 109 completion pg-pai si 8764_85373 1 1 both yes 0 484 1 0
5,531 65 2010 42.96 2009-11-09 14,557 2010-09-30 14,882 325 109 late adm discharge pg-pai si 8,764 2010 40.96 2007-11-09 13,826 2011-07-28 15,183 1,357 109 completion pg-pab si 5531_8764 1 0 both yes 0 1,056 1 1
5,531 65 2010 42.96 2009-11-09 14,557 2010-09-30 14,882 325 109 late adm discharge pg-pai si 37,235 2013 43.27 2010-03-01 14,669 2013-02-28 15,764 1,095 109 referral pg-pab si 5531_37235 1 1 both yes 0 213 1 0
5,531 65 2010 42.96 2009-11-09 14,557 2010-09-30 14,882 325 109 late adm discharge pg-pai si 85,373 2016 43.35 2010-03-31 14,699 2016-07-01 16,983 2,284 109 completion pg-pai si 5531_85373 1 1 both yes 0 183 1 0
37,235 65 2013 43.27 2010-03-01 14,669 2013-02-28 15,764 1,095 109 referral pg-pab si 85,373 2016 43.35 2010-03-31 14,699 2016-07-01 16,983 2,284 109 completion pg-pai si 37235_85373 1 1 both yes 1 1,065 1 0
24,760 66 2012 30.15 2012-02-01 15,371 2012-09-03 15,586 215 200 referral pg-pai si 33,553 2013 30.67 2012-08-10 15,562 2013-08-22 15,939 377 183 completion pg-pr no 24760_33553 0 1 1 24 1 0
33,553 66 2013 30.67 2012-08-10 15,562 2013-08-22 15,939 377 183 completion pg-pr no 43,437 2013 31.65 2013-08-02 15,919 2013-12-01 16,040 121 200 referral pg-pai si 33553_43437 0 0 second yes 0 20 0 0
17,895 67 2011 38.61 2008-08-29 14,120 2011-09-01 15,218 1,098 123 late dropout pg-pai si 26,234 2012 39.27 2009-04-27 14,361 2012-08-30 15,582 1,221 123 late dropout pg-pai si 17895_26234 1 1 both yes 0 857 1 0
1,821 67 2010 39.02 2009-01-27 14,271 2010-09-16 14,868 597 123 referral pg-pai si 17,895 2011 38.61 2008-08-29 14,120 2011-09-01 15,218 1,098 123 late dropout pg-pai si 1821_17895 1 1 both yes 1 748 1 1
1,821 67 2010 39.02 2009-01-27 14,271 2010-09-16 14,868 597 123 referral pg-pai si 26,234 2012 39.27 2009-04-27 14,361 2012-08-30 15,582 1,221 123 late dropout pg-pai si 1821_26234 1 1 both yes 1 507 1 0
Note.Each row represents an overlap. Variables ending with '_1' are the first case, and variables ending with '_2' correspond to the second case;
a= date; b= numeric
Same Center ID= If both cases share the same Center ID
Financed By SENDA= If both cases are financed by SENDA;
Referral= If the cause of discharge is the referral from another center (1= Referral);
Days Overlapped= Difference between the date of admission of the earlier treatment, and the date of discharge of the latter treatment
2nd treatment has more treatment days= Earlier treatment has more days of treatment

0.b.1 Overlappings due to missing discharge dates

First, we checked if a case had any missing value in the discharge date of the earlier treatment.

Code
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
bpmn::bpmn(paste0(wdpath, "cons/_input/overlapped_ranges_decision_tree_miss_disch_dates.bpmn"))

Decision tree for overlapping due to missing discharge dates

Apply the decision tree to the overlapping cases with missing dates of discharge, first identifying scenarios.

Code
eliminate_0c_a1<-
CONS_C1_df_dup_overlaps_COMP |>
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|> 
  filter(is.na(disch_date_1)|is.na(disch_date_2)) |> 
  filter(adm_date_1>adm_date_2, is.na(disch_date_1), !is.na(disch_date_2)) |>
    (\(df) {
    cat(paste0("0c.a1.Number of cases with missing dates of discharge, first obs. within second: ", formatC(nrow(df)*2, big.mark=",")),"\n")
    cat(paste0("0c.a1.Number of patients with missing dates of discharge, first obs. within second: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })() |> 
  tidytable::pivot_longer(
    cols = matches("_[12]$"),  # All columns ending with _1 or _2
    names_to = c(".value", "wave"),
    names_pattern = "(.+)_([12])",
    values_drop_na = FALSE) |>  #2025: from 1659 to 3318
  mutate(wave= as.numeric(wave)) |> 
  filter(is.na(disch_date)) |> 
  pull(rn) |> as.numeric()
# 0c.a1.Number of cases with missing dates of discharge, first obs. within second: 2 
# 0c.a1.Number of patients with missing dates of discharge, first obs. within second: 1
keep_0c_a1<-
CONS_C1_df_dup_overlaps_COMP |>
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>   
  filter(is.na(disch_date_1)|is.na(disch_date_2)) |> 
  filter(adm_date_1>adm_date_2, is.na(disch_date_1), !is.na(disch_date_2)) |>
    (\(df) {
    cat(paste0("0c.a1.Number of cases with missing dates of discharge, first obs. within second: ", formatC(nrow(df)*2, big.mark=",")),"\n")
    cat(paste0("0c.a1.Number of patients with missing dates of discharge, first obs. within second: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })() |> 
  tidytable::pivot_longer(
    cols = matches("_[12]$"),  # All columns ending with _1 or _2
    names_to = c(".value", "wave"),
    names_pattern = "(.+)_([12])",
    values_drop_na = FALSE) |>  #2025: from 1659 to 3318
  mutate(wave= as.numeric(wave)) |> 
  filter(!is.na(disch_date)) |> 
  pull(rn) |> as.numeric()
# 0c.a1.Number of cases with missing dates of discharge, first obs. within second: 2 
# 0c.a1.Number of patients with missing dates of discharge, first obs. within second: 1

replace_miss_dischdate_0c_a2<-
CONS_C1_df_dup_overlaps_COMP |>
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|> 
  filter(is.na(disch_date_1)|is.na(disch_date_2)) |> 
  filter(adm_date_1>adm_date_2, !is.na(disch_date_1)) |>
    (\(df) {
    cat(paste0("0c.a2.Number of cases with missing dates of discharge, admission date of first tr. replace miss 2nd disch date: ", formatC(nrow(df)*2, big.mark=",")),"\n")
    cat(paste0("0c.a2.Number of patients with missing dates of discharge, admission date of first tr. replace miss 2nd disch date: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })() |> 
  mutate(disch_date_num_2_rec= adm_date_rec_num_1-1)
# 0c.a2.Number of cases with missing dates of discharge, admission date of first tr. replace miss 2nd disch date: 0 
# 0c.a2.Number of patients with missing dates of discharge, admission date of first tr. replace miss 2nd disch date: 0 

replace_miss_dischdate_0c_a3_a<-
CONS_C1_df_dup_overlaps_COMP |>
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|> 
    filter(is.na(disch_date_1)|is.na(disch_date_2)) |> 
    filter(adm_date_1>adm_date_2, is.na(disch_date_1), is.na(disch_date_2))|> 
    (\(df) {
    cat(paste0("0c.a3.a.Number of cases with both missing dates of discharge (db retrieval=2022), replace 2nd discharge date with admission date of the 1st: ", formatC(nrow(df)*2, big.mark=",")),"\n")
    cat(paste0("0c.a3.a.Number of patients with both missing dates of discharge (db retrieval=2022), replace 2nd discharge date with admission date of the 1st: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })()|> 
  mutate(disch_date_num_2_rec= adm_date_rec_num_1-1)
# 0c.a3.a.Number of cases with both missing dates of discharge (db retrieval=2022), replace 2nd discharge date with admission date of the 1st: 0 
# 0c.a3.a.Number of patients with both missing dates of discharge (db retrieval=2022), replace 2nd discharge date with admission date of the 1st: 0 

discard_cases_0c_a3_b<-
CONS_C1_df_dup_overlaps_COMP |>
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|> 
    filter(is.na(disch_date_1)|is.na(disch_date_2)) |> 
    filter(adm_date_1>adm_date_2, is.na(disch_date_1), is.na(disch_date_2))|> 
    (\(df) {
    cat(paste0("0c.a3.b.Number of cases with both missing dates of discharge (db retrieval!=2022): ", formatC(nrow(df)*2, big.mark=",")),"\n")
    cat(paste0("0c.a3.b.Number of patients with both missing dates of discharge (db retrieval!=2022): ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })() |> 
  select(rn_1, rn_2) |> as.numeric()
# 0c.a3.b.Number of cases with both missing dates of discharge (db retrieval!=2022): 0 
# 0c.a3.b.Number of patients with both missing dates of discharge (db retrieval!=2022): 0 

replace_miss_dischdate_0c_b3_a<-
CONS_C1_df_dup_overlaps_COMP |>
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|> 
  filter(is.na(disch_date_1)|is.na(disch_date_2)) |> 
  filter(adm_date_1<adm_date_2, is.na(disch_date_1), is.na(disch_date_2),ano_bd_2==2022)|>
    (\(df) {
    cat(paste0("0c.b3.a.Number of cases with missing dates of discharge, admission date of 2nd tr. (db retrieval=2022) replace miss 1st disch date: ", formatC(nrow(df)*2, big.mark=",")),"\n")
    cat(paste0("0c.b3.a.Number of patients with missing dates of discharge, admission date of 2nd tr. (db retrieval=2022) replace miss 1st disch date: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })()|> 
  mutate(disch_date_num_1_rec= adm_date_rec_num_2-1)
# c.b3.a.Number of cases with missing dates of discharge, admission date of 2nd tr. (db retrieval=2022) replace miss 1st disch date: 2 
# 0c.b3.a.Number of patients with missing dates of discharge, admission date of 2nd tr. (db retrieval=2022) replace miss 1st disch date: 1 

discard_0c_b3_b<-
CONS_C1_df_dup_overlaps_COMP |>
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|> 
  filter(is.na(disch_date_1)|is.na(disch_date_2)) |> 
  filter(adm_date_1<adm_date_2, is.na(disch_date_1), is.na(disch_date_2), ano_bd_2!=2022) |>
    (\(df) {
    cat(paste0("0c.b3.b.Number of cases with missing dates of discharge, both treatments are not 2022: ", formatC(nrow(df)*2, big.mark=",")),"\n")
    cat(paste0("0c.b3.b.Number of patients with missing dates of discharge, both treatments are not 2022: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })() |> 
  select(rn_1, rn_2) |> as.numeric()
# 0c.b3.b.Number of cases with missing dates of discharge, both treatments are not 2022: 0 
# 0c.b3.b.Number of patients with missing dates of discharge, both treatments are not 2022: 0 

eliminate_0c_b1<-
CONS_C1_df_dup_overlaps_COMP |>
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|> 
  filter(is.na(disch_date_1)|is.na(disch_date_2)) |> 
  filter(adm_date_1<adm_date_2, is.na(disch_date_2), !is.na(disch_date_1)) |>
    (\(df) {
    cat(paste0("0c.b1.Number of cases with missing dates of discharge, second obs. within first: ", formatC(nrow(df)*2, big.mark=",")),"\n")
    cat(paste0("0c.b1.Number of patients with missing dates of discharge, second obs. within first: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })() |> 
  tidytable::pivot_longer(
    cols = matches("_[12]$"),  # All columns ending with _1 or _2
    names_to = c(".value", "wave"),
    names_pattern = "(.+)_([12])",
    values_drop_na = FALSE) |>  #2025: from 1659 to 3318
  mutate(wave= as.numeric(wave)) |> 
  arrange(pair_id) |> 
  filter(is.na(disch_date)) |> 
  pull(rn) |>  as.numeric()

keep_0c_b1<-
CONS_C1_df_dup_overlaps_COMP |>
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|> 
  filter(is.na(disch_date_1)|is.na(disch_date_2)) |> 
  filter(adm_date_1<adm_date_2, is.na(disch_date_2), !is.na(disch_date_1)) |>
    (\(df) {
    cat(paste0("0c.b1.Number of cases with missing dates of discharge, second obs. within first: ", formatC(nrow(df)*2, big.mark=",")),"\n")
    cat(paste0("0c.b1.Number of patients with missing dates of discharge, second obs. within first: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })() |> 
  tidytable::pivot_longer(
    cols = matches("_[12]$"),  # All columns ending with _1 or _2
    names_to = c(".value", "wave"),
    names_pattern = "(.+)_([12])",
    values_drop_na = FALSE) |>  #2025: from 1659 to 3318
  mutate(wave= as.numeric(wave)) |> 
  arrange(pair_id) |> 
  filter(!is.na(disch_date)) |> 
  pull(rn) |>  as.numeric()
# 0c.b1.Number of cases with missing dates of discharge, second obs. within first: 8 
# 0c.b1.Number of patients with missing dates of discharge, second obs. within first: 4

replace_miss_dischdate_0c_b2<-
CONS_C1_df_dup_overlaps_COMP |>
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|> 
  filter(is.na(disch_date_1)|is.na(disch_date_2)) |> 
  filter(adm_date_1<adm_date_2, !is.na(disch_date_2)) |>
    (\(df) {
    cat(paste0("0c.b2.Number of cases with missing dates of discharge, admission date of first tr. replace miss 1st disch date: ", formatC(nrow(df)*2, big.mark=",")),"\n")
    cat(paste0("0c.b2.Number of patients with missing dates of discharge, admission date of first tr. replace miss 1st disch date: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })() |> 
  mutate(disch_date_num_1_rec= adm_date_rec_num_2-1)
# 0c.b2.Number of cases with missing dates of discharge, admission date of first tr. replace miss 1st disch date: 4 
# 0c.b2.Number of patients with missing dates of discharge, admission date of first tr. replace miss 1st disch date: 2 

invisible("If you want to explore the source of errors")
# select(replace_miss_dischdate_0c_b2, hash_key, adm_date_1, disch_date_num_1_rec) |>
#   left_join(select(SISTRAT23_c1_2010_2022_df_prev1, hash_key, adm_date, fecha_egreso_de_tratamiento), by=c("hash_key"="hash_key", "adm_date_1"="adm_date"))
0c.a1.Number of cases with missing dates of discharge, first obs. within second: 2 
0c.a1.Number of patients with missing dates of discharge, first obs. within second: 1 
0c.a1.Number of cases with missing dates of discharge, first obs. within second: 2 
0c.a1.Number of patients with missing dates of discharge, first obs. within second: 1 
0c.a2.Number of cases with missing dates of discharge, admission date of first tr. replace miss 2nd disch date: 0 
0c.a2.Number of patients with missing dates of discharge, admission date of first tr. replace miss 2nd disch date: 0 
0c.a3.a.Number of cases with both missing dates of discharge (db retrieval=2022), replace 2nd discharge date with admission date of the 1st: 0 
0c.a3.a.Number of patients with both missing dates of discharge (db retrieval=2022), replace 2nd discharge date with admission date of the 1st: 0 
0c.a3.b.Number of cases with both missing dates of discharge (db retrieval!=2022): 0 
0c.a3.b.Number of patients with both missing dates of discharge (db retrieval!=2022): 0 
0c.b3.a.Number of cases with missing dates of discharge, admission date of 2nd tr. (db retrieval=2022) replace miss 1st disch date: 2 
0c.b3.a.Number of patients with missing dates of discharge, admission date of 2nd tr. (db retrieval=2022) replace miss 1st disch date: 1 
0c.b3.b.Number of cases with missing dates of discharge, both treatments are not 2022: 0 
0c.b3.b.Number of patients with missing dates of discharge, both treatments are not 2022: 0 
0c.b1.Number of cases with missing dates of discharge, second obs. within first: 8 
0c.b1.Number of patients with missing dates of discharge, second obs. within first: 4 
0c.b1.Number of cases with missing dates of discharge, second obs. within first: 8 
0c.b1.Number of patients with missing dates of discharge, second obs. within first: 4 
0c.b2.Number of cases with missing dates of discharge, admission date of first tr. replace miss 1st disch date: 4 
0c.b2.Number of patients with missing dates of discharge, admission date of first tr. replace miss 1st disch date: 2 

We apply the scenarios found to the main database.

Code
#eliminate_0c_a1
#keep_0c_a1
#eliminate_0c_b1
#keep_0c_b1
#discard_cases_0c_a3_b
#discard_0c_b3_b

hashkeys_overlapped_discarded_missing_dates<-
  rbind.data.frame(filter(SISTRAT23_c1_2010_2022_df_prev1h, rn %in% discard_cases_0c_a3_b[!is.na(discard_cases_0c_a3_b)]),
  filter(SISTRAT23_c1_2010_2022_df_prev1h, rn %in% discard_0c_b3_b[!is.na(discard_0c_b3_b)])) |> distinct(hash_key) |> pull(hash_key)

SISTRAT23_c1_2010_2022_df_prev1i<-
  SISTRAT23_c1_2010_2022_df_prev1h|>
      (\(df) {
    cat(paste0("4. Database before correcting overlapping with missing discharge dates, obs.: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4. Database before correcting overlapping with missing discharge dates, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })()|> 
  mutate(OBS= case_when(hash_key %in% hashkeys_overlapped_discarded_missing_dates~paste0(as.character(OBS),";","4.1.0c.a/b3.b.Eliminate overlapping cases with both missing dates prior to 2022"), T~ OBS))|> 
  filter(!rn %in% c(discard_cases_0c_a3_b, discard_0c_b3_b))|> 
  mutate(OBS= case_when(rn %in% c(keep_0c_a1, keep_0c_b1)~paste0(as.character(OBS),";","4.2.0c.a1/b1.Eliminate overlapping cases with episodes within others and missing discharge dates"), T~ OBS))|> 
  filter(!rn %in% c(eliminate_0c_a1, eliminate_0c_b1))|>
  mutate(OBS= case_when(rn %in% c(replace_miss_dischdate_0c_a2$rn_2, replace_miss_dischdate_0c_b2$rn_1, replace_miss_dischdate_0c_a3_a$rn_2, replace_miss_dischdate_0c_b3_a$rn_1)~ paste0(as.character(OBS),";","4.3.0c.b2/a2/a3.a/b3.a.Replace missing discharge date with admission date of the first treatment minus 1 day"), T~ OBS)) |>
  left_join(replace_miss_dischdate_0c_a2[, c("rn_2", "disch_date_num_2_rec")], by=c("rn"="rn_2"))|>
  left_join(replace_miss_dischdate_0c_b2[, c("rn_1", "disch_date_num_1_rec")], by=c("rn"="rn_1"))|>
  left_join(replace_miss_dischdate_0c_a3_a[, c("rn_2", "disch_date_num_2_rec")], by=c("rn"="rn_2"), suffix = c("_a2", "_a3_a"))|>
  left_join(replace_miss_dischdate_0c_b3_a[, c("rn_1", "disch_date_num_1_rec")], by=c("rn"="rn_1"), suffix = c("_b2", "_b3_a"))|>
  mutate(disch_date_num_rec= case_when(!is.na(disch_date_num_2_rec_a2 )~ disch_date_num_2_rec_a2, !is.na(disch_date_num_1_rec_b2)~ disch_date_num_1_rec_b2, !is.na(disch_date_num_2_rec_a3_a)~ disch_date_num_2_rec_a3_a, !is.na(disch_date_num_1_rec_b3_a)~ disch_date_num_1_rec_b3_a, T~ disch_date_rec0_num)) |>
  mutate(tr_compliance_rec= case_when(!is.na(disch_date_num_2_rec_a2)~ NA_character_, !is.na(disch_date_num_1_rec_b2)~ NA_character_, !is.na(disch_date_num_2_rec_a3_a)~ NA_character_, !is.na(disch_date_num_1_rec_b3_a)~ NA_character_, T~ tr_compliance_rec))|>
  mutate(dit_rec2= disch_date_num_rec-adm_date_rec_num)|>
  mutate(tr_compliance_rec= case_when(!is.na(disch_date_num_2_rec_a2)~ NA_character_, !is.na(disch_date_num_1_rec_b2)~ NA_character_, !is.na(disch_date_num_2_rec_a3_a)~ NA_character_, !is.na(disch_date_num_1_rec_b3_a)~ NA_character_, T~ tr_compliance_rec))|>
  select(-disch_date_num_2_rec_a2, -disch_date_num_1_rec_b2, -disch_date_num_2_rec_a3_a, -disch_date_num_1_rec_b3_a)|>
        (\(df) {
    cat(paste0("4. Database after correcting overlapping with missing discharge dates, obs.: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4. Database after correcting overlapping with missing discharge dates, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1h))stop("Error: Added treatment episodes in the process")
    df
  })()
# 4. Database before correcting overlapping with missing discharge dates, obs.: 150,187 
# 4. Database before correcting overlapping with missing discharge dates, RUNs: 106,283 
# 4. Database after correcting overlapping with missing discharge dates, obs.: 150,182 
# 4. Database after correcting overlapping with missing discharge dates, RUNs: 106,283 
4. Database before correcting overlapping with missing discharge dates, obs.: 150,187 
4. Database before correcting overlapping with missing discharge dates, RUNs: 106,283 
4. Database after correcting overlapping with missing discharge dates, obs.: 150,182 
4. Database after correcting overlapping with missing discharge dates, RUNs: 106,283 

The database SISTRAT23_c1_2010_2022_df_prev1i includes overlapping correction to account for missing discharge dates. We replaced the dates in disch_date_num_rec, dit_rec2 accounting for the new discharge date and tr_compliance_rec were replaced with missing values for the cases that had missing discharge dates with replaced values.

0.b.2 After replacement for missing dates of discharge

Code
CONS_C1_df_dup_intervals_after_miss<- 
  SISTRAT23_c1_2010_2022_df_prev1i|>
    mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec), 19475, disch_date_num_rec))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
    rename("hash_key_2"="hash_key", "rn2"="rn")|>
    select(rn2, hash_key_2, TABLE, adm_age_rec, adm_date_rec, adm_date_rec_num , disch_date_rec0, disch_date_num_miss, dit_rec2, id_centro, tr_compliance_rec, plan_type, senda)|> 
    #dplyr::filter(motivodeegreso!="Derivación")|>
    data.table::as.data.table()
  
overlap_dates_C1_after_miss <- janitor::clean_names(
    sqldf::sqldf(
      "
      SELECT *
      FROM CONS_C1_df_dup_intervals_after_miss AS x
      INNER JOIN CONS_C1_df_dup_intervals_after_miss AS y 
      ON x.hash_key_2 = y.hash_key_2 
         AND x.rn2 < y.rn2  -- Avoids duplicates (eg.: x vs y and then y vs x)
         AND x.adm_date_rec_num < y.disch_date_num_miss  -- x Admitted before being admitted into another treatment
         AND x.disch_date_num_miss > y.adm_date_rec_num  -- x Discharged after being admitted in other
         "
    ))  |>
    `colnames<-`(c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1",  "rn_2",
                   "hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2")) 
  
    cat(paste0("Number of overlapped dates, observations: ", nrow(overlap_dates_C1_after_miss)),"\n")
    cat(paste0("Number of overlapped dates, RUNs: ", nrow(distinct(overlap_dates_C1_after_miss, hash_key_1))))
    #Number of overlapped dates, observations: 1546 june 2025; 1554, 1518, 1579
    #Number of overlapped dates, RUNs: 1405 june 2025; 1412, 1385, 1411

#The rows on the left originate from older databases.
CONS_C1_df_dup_overlaps_COMP_after_miss <- 
as_tidytable(overlap_dates_C1_after_miss)|>
  mutate(pair_id= paste0(rn_1,"_",rn_2))|> 
  mutate(same_id=ifelse(id_centro_1==id_centro_2,1,0))|>
  mutate(bd_2_earlier=ifelse(ano_bd_2>ano_bd_1,1,0))|> #es el dato de la derecha de una base de datos mas reciente.
  mutate(senda_status= case_when(senda_1=="si" & senda_2=="si"~ "both yes", senda_1=="no" & senda_2=="no"~ "both no", senda_1=="no" & senda_2=="si"~ "second yes", senda_1=="no" & senda_2=="no"~ "second no",  T~NA_character_))|>
  mutate(referral= ifelse(tr_compliance_1=="referral",1,0))|>
  mutate(days_overlapped=disch_date_num_1-adm_date_rec_num_2)|> # para que hayan dias positivos. Se supone que la fecha de egreso es más reciente que la fecha de ingreso del evento que superpone.
  mutate(more_dit=ifelse(dit_2>dit_1,1,0))|> #más días tratado en 2
  mutate(trat_1_within_2=ifelse(disch_date_num_1<disch_date_num_2 & adm_date_rec_num_1>adm_date_rec_num_2,1,0))|>
  select(-hash_key_2) |> 
  rename("hash_key"="hash_key_1")

####
CONS_C1_df_dup_overlaps_COMP_after_miss|>
      (\(df) {
        rio::export(df, "_out/_overlaps_dup_step_2_after_miss.xlsx") #for visual comparison in excel
        knitr::kable(filter(df, hash_key %in% pull(sample_n_with_seed(CONS_C1_df_dup_overlaps_COMP_after_miss,20, seed=2125),"hash_key"))|> mutate(hash_key= as.numeric(factor(hash_key))), format = "html", format.args = list(decimal.mark = ".", big.mark = ","), caption="Cases with overlapped treatment ranges (after correcting for missing discharge dates)", align = rep('c', 32),  
               #col.names = c("Row No.(1)", "HASH", "Year of\nDataset(1)", "Admission age(1)", "Admission\ndate(1)(a)", "Admission\ndate(1)(b)", "Discharge\ndate(1)(a)", "Discharge\ndate(1)(b)", "Treatment Days(1)", "Center ID(1)", "Cause of\nDischarge(1)", "Plan Type(1)", "SENDA(1)", "Row No.(2)", "Year of\nDataset(2)", "Admission age(2)", "Admission\ndate(2)(a)", "Admission\ndate(2)(b)", "Discharge\ndate(2)(a)", "Discharge\ndate(2)(b)", "Treatment Days(2)", "Center ID(2)", "Cause of\nDischarge(2)", "Plan Type(2)", "SENDA(2)", "Same Center ID", "Earlier Dataset \nof 2nd Treatment", "Financed \nBy SENDA", "Referral", "Days Overlapped", "2nd treatment has more treatment days", "1st treatment\nabsorbs 2nd")
  ) |>
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"),font_size = 8)|>
  kableExtra::add_footnote( c("Note.Each row represents an overlap. Variables ending with '_1' are the first case, and variables ending with '_2' correspond to the second case;", "a= date; b= numeric", "Same Center ID= If both cases share the same Center ID",  "Financed By SENDA= If both cases are financed by SENDA;", "Referral= If the cause of discharge is the referral from another center (1= Referral);","Days Overlapped= Difference between the date of admission of the earlier treatment, and the date of discharge of the latter treatment","2nd treatment has more treatment days= Earlier treatment has more days of treatment"), notation = "none")|>
  kableExtra::scroll_box(width = "100%", height = "375px")
      })()
Number of overlapped dates, observations: 1546 
Number of overlapped dates, RUNs: 1405
Cases with overlapped treatment ranges (after correcting for missing discharge dates)
rn_1 hash_key ano_bd_1 adm_age_1 adm_date_1 adm_date_rec_num_1 disch_date_1 disch_date_num_1 dit_1 id_centro_1 tr_compliance_1 plan_type_1 senda_1 rn_2 ano_bd_2 adm_age_2 adm_date_2 adm_date_rec_num_2 disch_date_2 disch_date_num_2 dit_2 id_centro_2 tr_compliance_2 plan_type_2 senda_2 pair_id same_id bd_2_earlier senda_status referral days_overlapped more_dit trat_1_within_2
49,390 1 2014 35.81 2013-06-17 15,873 2014-01-27 16,097 224 166 late dropout pg-pab si 53,962 2014 36.35 2013-12-30 16,069 2014-01-03 16,073 4 163 early dropout m-pr si 49390_53962 0 0 both yes 0 28 0 0
78,289 2 2015 35.03 2015-06-18 16,604 2015-07-28 16,644 40 141 referral pg-pai si 80,895 2015 35.02 2015-06-12 16,598 2015-11-26 16,765 167 141 late dropout pg-pai si 78289_80895 1 0 both yes 1 46 1 1
79,294 2 2015 35.14 2015-07-28 16,644 2015-08-10 16,657 13 142 referral m-pr si 80,895 2015 35.02 2015-06-12 16,598 2015-11-26 16,765 167 141 late dropout pg-pai si 79294_80895 0 0 both yes 1 59 1 1
122,569 3 2017 37.58 2017-09-13 17,422 2018-01-23 17,554 132 132 referral pg-pr si 133,197 2018 37.89 2018-01-02 17,533 2018-01-31 17,562 29 726 referral pg-pr si 122569_133197 0 1 both yes 1 21 0 0
38,600 4 2013 25.30 2013-04-04 15,799 2013-04-23 15,818 19 238 referral pg-pai si 38,810 2013 25.34 2013-04-22 15,817 2013-11-11 16,020 203 258 late dropout pg-pr si 38600_38810 0 0 both yes 1 1 1 0
124,540 5 2017 28.46 2017-11-24 17,494 2018-01-23 17,554 60 132 referral pg-pr si 133,182 2018 28.57 2018-01-02 17,533 2018-07-06 17,718 185 726 completion pg-pr si 124540_133182 0 1 both yes 1 21 1 0
132,639 6 2018 64.13 2017-11-14 17,484 2018-03-29 17,619 135 291 referral pg-pai si 135,843 2018 64.47 2018-03-20 17,610 2019-01-31 17,927 317 303 late adm discharge pg-pr si 132639_135843 0 0 both yes 1 9 1 0
126,440 7 2018 28.25 2016-12-12 17,147 2018-07-04 17,716 569 465 completion pg-pab si 127,111 2018 28.48 2017-03-06 17,231 2018-10-08 17,812 581 263 referral m-pai si 126440_127111 0 0 both yes 0 485 1 0
109,974 8 2017 34.07 2016-10-04 17,078 2017-03-24 17,249 171 232 referral pg-pai si 113,052 2017 34.41 2017-02-07 17,204 2017-06-12 17,329 125 234 completion m-pr no 109974_113052 0 0 1 45 0 0
75,459 9 2015 22.99 2015-04-27 16,552 2015-09-08 16,686 134 166 late dropout pg-pai si 89,203 2016 23.35 2015-09-04 16,682 2016-02-23 16,854 172 163 late dropout pg-pr si 75459_89203 0 1 both yes 0 4 1 0
58,294 10 2014 23.99 2014-05-19 16,209 2014-10-29 16,372 163 143 late dropout pg-pai si 69,404 2015 24.39 2014-10-14 16,357 2019-12-31 18,261 1,904 364 adm truncated pg-pai si 58294_69404 0 1 both yes 0 15 1 0
36,916 11 2013 35.16 2013-02-13 15,749 2013-05-23 15,848 99 287 referral pg-pai si 65,608 2015 35.39 2013-05-08 15,833 2016-01-05 16,805 972 295 referral pg-pai si 36916_65608 0 1 both yes 1 15 1 0
23,744 12 2012 22.54 2012-01-02 15,341 2012-02-16 15,386 45 204 referral pg-pab si 24,662 2012 22.66 2012-02-14 15,384 2012-07-17 15,538 154 215 late dropout pg-pr si 23744_24662 0 0 both yes 1 2 1 0
200,444 13 2021 33.75 2021-07-26 18,834 2021-09-30 18,900 66 259 referral pg-pai si 209,379 2022 33.89 2021-09-14 18,884 2022-02-07 19,030 146 436 referral m-pai si 200444_209379 0 1 both yes 1 16 1 0
21,449 14 2012 39.12 2011-05-02 15,096 2012-05-02 15,462 366 337 referral pg-pai si 28,085 2012 40.10 2012-04-25 15,455 2012-07-03 15,524 69 258 early adm discharge pg-pr si 21449_28085 0 0 both yes 1 7 0 0
64,179 15 2014 37.82 2014-11-03 16,377 2015-01-27 16,462 85 141 completion pg-pab si 71,274 2015 38.00 2015-01-07 16,442 2015-12-17 16,786 344 141 completion pg-pr no 64179_71274 1 1 0 20 1 0
10,433 16 2011 46.26 2010-05-19 14,748 2011-07-20 15,175 427 104 late dropout m-pr no 17,877 2011 47.43 2011-07-18 15,173 2012-01-31 15,370 197 331 late dropout m-pai si 10433_17877 0 0 second yes 0 2 0 0
24,667 17 2012 31.20 2012-02-15 15,385 2012-05-18 15,478 93 161 referral m-pai si 27,109 2012 31.45 2012-05-17 15,477 2012-10-03 15,616 139 275 late dropout m-pr si 24667_27109 0 0 both yes 1 1 1 0
27,109 17 2012 31.45 2012-05-17 15,477 2012-10-03 15,616 139 275 late dropout m-pr si 48,001 2014 31.83 2012-10-02 15,615 2015-02-16 16,482 867 161 late dropout m-pai si 27109_48001 0 1 both yes 0 1 1 0
31,236 18 2012 25.11 2012-11-05 15,649 2013-01-01 15,706 57 206 referral pg-pab si 36,078 2013 25.24 2012-12-21 15,695 2013-02-27 15,763 68 205 referral pg-pai si 31236_36078 0 1 both yes 1 11 1 0
58,436 19 2014 34.43 2014-06-04 16,225 2014-06-23 16,244 19 432 early dropout m-pr si 67,549 2015 34.43 2014-06-02 16,223 2015-02-11 16,477 254 291 completion pg-pai si 58436_67549 0 1 both yes 0 21 1 1
17,323 20 2011 48.68 2011-06-23 15,148 2011-09-26 15,243 95 232 referral pg-pai si 18,975 2011 48.91 2011-09-13 15,230 2011-12-14 15,322 92 246 late dropout pg-pai si 17323_18975 0 0 both yes 1 13 0 0
Note.Each row represents an overlap. Variables ending with '_1' are the first case, and variables ending with '_2' correspond to the second case;
a= date; b= numeric
Same Center ID= If both cases share the same Center ID
Financed By SENDA= If both cases are financed by SENDA;
Referral= If the cause of discharge is the referral from another center (1= Referral);
Days Overlapped= Difference between the date of admission of the earlier treatment, and the date of discharge of the latter treatment
2nd treatment has more treatment days= Earlier treatment has more days of treatment

We then proceeded to correct the overlapping cases with missing discharge dates.

Four alternatives were delimited in order to resolve overlapped dates:

  • Impute treated days and replace the date of discharge
  • Keep the earliest treatment
  • Discard the earliest treatment
  • Subtract days to the date of discharge of the last treatment

0.b.4 Overlappings <= 30 days

Meanwhile, we focus in cases with overlap of less than 30 days.

Code
replace_disch_dates_0a<-
CONS_C1_df_dup_overlaps_COMP_after_miss|>
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|> 
  filter(days_overlapped<=30)|> 
      (\(df) {
    cat(paste0("4. Overlapping <= 30 days, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4. Overlapping <= 30 days, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })()|>#1,019 #1002 # 1,022 1,005; 1,021 1,004 june 2025
  mutate(OBS=  case_when(adm_date_rec_num_1>adm_date_rec_num_2~ "4.3.0a.<=30 days overlapping, replaced w date of discharge of last treatment minus 1(first)", adm_date_rec_num_1< adm_date_rec_num_2~ "4.3.0a.<=30 days overlapping, replaced w date of discharge of last treatment minus 1(second)",T~NA_character_))|> 
  mutate(disch_date_num_2_rec=  case_when(adm_date_rec_num_1>adm_date_rec_num_2~ adm_date_rec_num_1-1, T~NA_real_))|> 
  mutate(disch_date_num_1_rec=  case_when(adm_date_rec_num_2>adm_date_rec_num_1~ adm_date_rec_num_2-1, T~NA_real_))
4. Overlapping <= 30 days, cases: 1,021 
4. Overlapping <= 30 days, RUNs: 1,004 

We apply the decision tree to the overlapping cases with less than 30 days of overlap.

Code
#replace_disch_dates_0a[, c("rn_1", "disch_date_num_2_rec", "OBS")]

SISTRAT23_c1_2010_2022_df_prev1j<-
  SISTRAT23_c1_2010_2022_df_prev1i|>
      (\(df) {
    cat(paste0("4. Database before correcting overlapping with <= 30 days of overlapping, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4. Database before correcting overlapping with <= 30 days of overlapping, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })()|> 
  left_join(subset(replace_disch_dates_0a[, c("rn_2", "disch_date_num_2_rec", "OBS")], !is.na(disch_date_num_2_rec)), by=c("rn"="rn_2"), suffix= c("","_0a1"), first=T)|>
  left_join(subset(replace_disch_dates_0a[, c("rn_1", "disch_date_num_1_rec", "OBS")], !is.na(disch_date_num_1_rec)), by=c("rn"="rn_1"), suffix= c("","_0a2"), first=T)|>
  mutate(OBS= case_when(!is.na(OBS_0a1)~paste0(as.character(OBS),";",OBS_0a1), T~ OBS))|>
  mutate(OBS= case_when(!is.na(OBS_0a2)~paste0(as.character(OBS),";",OBS_0a2), T~ OBS))|> 
  mutate(disch_date_num_rec2= case_when(!is.na(disch_date_num_2_rec)~ disch_date_num_2_rec, !is.na(disch_date_num_1_rec)~ disch_date_num_1_rec, T~ disch_date_num_rec))|> 
  mutate(dit_rec3= disch_date_num_rec2- adm_date_rec_num)|>
  select(-disch_date_num_2_rec, -disch_date_num_1_rec, -OBS_0a1, -OBS_0a2)|>
        (\(df) {
    cat(paste0("4. Database after correcting overlapping with <= 30 days of overlapping, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4. Database after correcting overlapping with <= 30 days of overlapping, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1i))stop("Error: Added treatment episodes in the process")
    df
  })()
# 4. Database before correcting overlapping with <= 30 days of overlapping, cases: 150,182 
# 4. Database before correcting overlapping with <= 30 days of overlapping, RUNs: 106,283 
# 4. Database after correcting overlapping with <= 30 days of overlapping, cases: 150,182 
# 4. Database after correcting overlapping with <= 30 days of overlapping, RUNs: 106,283
4. Database before correcting overlapping with <= 30 days of overlapping, cases: 150,182 
4. Database before correcting overlapping with <= 30 days of overlapping, RUNs: 106,283 
4. Database after correcting overlapping with <= 30 days of overlapping, cases: 150,182 
4. Database after correcting overlapping with <= 30 days of overlapping, RUNs: 106,283 

We obtained the database SISTRAT23_c1_2010_2022_df_prev1j. We corrected the dates in disch_date_num_rec2, dit_rec3 to account for the new discharge date.

0.b.5 Treatment episode without a single day in treatment

We apply the detection of duplicates again and scenarios

Code
CONS_C1_df_dup_intervals_after_miss_less30d<- 
  SISTRAT23_c1_2010_2022_df_prev1j|>
    mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec2), 19475, disch_date_num_rec2))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
    rename("hash_key_2"="hash_key", "rn2"="rn")|>
    select(rn2, hash_key_2, TABLE, adm_age_rec, adm_date_rec, adm_date_rec_num , disch_date_rec0, disch_date_num_miss, dit_rec3, id_centro, tr_compliance_rec, plan_type, senda)|> 
    #dplyr::filter(motivodeegreso!="Derivación")|>
    data.table::as.data.table()
  
overlap_dates_C1_after_miss_less30d <- janitor::clean_names(
    sqldf::sqldf(
      "
      SELECT *
      FROM CONS_C1_df_dup_intervals_after_miss_less30d AS x
      INNER JOIN CONS_C1_df_dup_intervals_after_miss_less30d AS y 
      ON x.hash_key_2 = y.hash_key_2 
         AND x.rn2 < y.rn2  -- Avoids duplicates (eg.: x vs y and then y vs x)
         AND x.adm_date_rec_num < y.disch_date_num_miss  -- x Admitted before being admitted into another treatment
         AND x.disch_date_num_miss > y.adm_date_rec_num  -- x Discharged after being admitted in other
         "
    ))  |>
    `colnames<-`(c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1",  "rn_2", "hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2")) 
  
    cat(paste0("Number of overlapped dates, observations: ", nrow(overlap_dates_C1_after_miss_less30d)),"\n")
    cat(paste0("Number of overlapped dates, RUNs: ", nrow(distinct(overlap_dates_C1_after_miss_less30d, hash_key_1))))
    #Number of overlapped dates, observations: 536 ; 560 ; 532 ; 525 june 2025
    #Number of overlapped dates, RUNs: 412 ; 420 ; 418 ; 412 june 2025

#The rows on the left originate from older databases.
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d <- 
as_tidytable(overlap_dates_C1_after_miss_less30d)|>
  mutate(pair_id= paste0(rn_1,"_",rn_2))|> 
  mutate(same_id=ifelse(id_centro_1==id_centro_2,1,0))|>
  mutate(bd_2_earlier=ifelse(ano_bd_2>ano_bd_1,1,0))|> #es el dato de la derecha de una base de datos mas reciente.
  mutate(senda_status= case_when(senda_1=="si" & senda_2=="si"~ "both yes", senda_1=="no" & senda_2=="no"~ "both no", senda_1=="no" & senda_2=="si"~ "second yes", senda_1=="no" & senda_2=="no"~ "second no",  T~NA_character_))|>
  mutate(referral= ifelse(tr_compliance_1=="referral",1,0))|>
  mutate(days_overlapped=disch_date_num_1-adm_date_rec_num_2)|> # para que hayan dias positivos. Se supone que la fecha de egreso es más reciente que la fecha de ingreso del evento que superpone.
  mutate(more_dit=ifelse(dit_2>dit_1,1,0))|> #más días tratado en 2
  mutate(trat_1_within_2=ifelse(disch_date_num_1<disch_date_num_2 & adm_date_rec_num_1>adm_date_rec_num_2,1,0))|>
  select(-hash_key_2) |> 
  rename("hash_key"="hash_key_1")
Number of overlapped dates, observations: 525 
Number of overlapped dates, RUNs: 412

We apply the scenarios to the main database, discarding cases with less than one day in treatment.

Code
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d |> 
  filter(dit_2<1| dit_2<1)|>
        (\(df) {
    cat(paste0("4. Less than one day in treatment, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4. Less than one day in treatment, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
  })()
# 4. Less than one day in treatment, cases: 1 
# 4. Less than one day in treatment, RUNs: 1 

discard_0c_0b<-
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d |> 
  filter(dit_2<1| dit_2<1)|>
  mutate(rn= case_when(dit_1<1~ rn_1, dit_2<1~ rn_2, T~NA_real_)) |> 
  pull(rn)
keep_0c_0b<-
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d |> 
  filter(dit_2<1| dit_2<1)|>
  mutate(rn= case_when(dit_1>0~ rn_1, dit_2>0~ rn_2, T~NA_real_)) |> 
  pull(rn)


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

SISTRAT23_c1_2010_2022_df_prev1k<-
  SISTRAT23_c1_2010_2022_df_prev1j|>
      (\(df) {
    cat(paste0("4. Database before discarding cases with less than one day in treatment, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4. Database before discarding cases with less than one day in treatment, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })()|> 
  filter(!(rn %in% discard_0c_0b))|>
  mutate(OBS= case_when(rn %in% keep_0c_0b~ paste0(as.character(OBS),";","4.3.0b.Discard treatment episode with no days in treatment"), T~ OBS))|> 
        (\(df) {
    cat(paste0("4. Database after correcting overlapping with <= 30 days of overlapping, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4. Database after correcting overlapping with <= 30 days of overlapping, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1j))stop("Error: Added treatment episodes in the process")
    df
  })()
# 4. Database before discarding cases with less than one day in treatment, cases: 150,182 
# 4. Database before discarding cases with less than one day in treatment, RUNs: 106,283 
# 4. Database after correcting overlapping with <= 30 days of overlapping, cases: 150,181 
# 4. Database after correcting overlapping with <= 30 days of overlapping, RUNs: 106,283 
4. Less than one day in treatment, cases: 1 
4. Less than one day in treatment, RUNs: 1 
4. Database before discarding cases with less than one day in treatment, cases: 150,182 
4. Database before discarding cases with less than one day in treatment, RUNs: 106,283 
4. Database after correcting overlapping with <= 30 days of overlapping, cases: 150,181 
4. Database after correcting overlapping with <= 30 days of overlapping, RUNs: 106,283 

The new database is called SISTRAT23_c1_2010_2022_df_prev1k.

0.b.6 Citeria based on sharing center ID, SENDA financing status, treatment length and referral discharge

We apply the detection of duplicates again and scenarios

Code
CONS_C1_df_dup_intervals_after_miss_less30d_0d<- 
  SISTRAT23_c1_2010_2022_df_prev1k|>
    mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec2), 19475, disch_date_num_rec2))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
    rename("hash_key_2"="hash_key", "rn2"="rn")|>
    select(rn2, hash_key_2, TABLE, adm_age_rec, adm_date_rec, adm_date_rec_num , disch_date_rec0, disch_date_num_miss, dit_rec3, id_centro, tr_compliance_rec, plan_type, senda)|> 
    #dplyr::filter(motivodeegreso!="Derivación")|>
    data.table::as.data.table()
  
overlap_dates_C1_after_miss_less30d_0d <- janitor::clean_names(
    sqldf::sqldf(
      "
      SELECT *
      FROM CONS_C1_df_dup_intervals_after_miss_less30d_0d AS x
      INNER JOIN CONS_C1_df_dup_intervals_after_miss_less30d_0d AS y 
      ON x.hash_key_2 = y.hash_key_2 
         AND x.rn2 < y.rn2  -- Avoids duplicates (eg.: x vs y and then y vs x)
         AND x.adm_date_rec_num < y.disch_date_num_miss  -- x Admitted before being admitted into another treatment
         AND x.disch_date_num_miss > y.adm_date_rec_num  -- x Discharged after being admitted in other
         "
    ))  |>
    `colnames<-`(c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1",  "rn_2",
                   "hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2")) 
  
    cat(paste0("Number of overlapped dates, observations: ", nrow(overlap_dates_C1_after_miss_less30d_0d)),"\n")
    cat(paste0("Number of overlapped dates, RUNs: ", nrow(distinct(overlap_dates_C1_after_miss_less30d_0d, hash_key_1))))
    #Number of overlapped dates, observations: 559 ; 524 june 2025
    #Number of overlapped dates, RUNs: 419 ; 411 june 2025

#The rows on the left originate from older databases.
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d <- 
as_tidytable(overlap_dates_C1_after_miss_less30d_0d)|>
  mutate(pair_id= paste0(rn_1,"_",rn_2))|> 
  mutate(same_id=ifelse(id_centro_1==id_centro_2,1,0))|>
  mutate(bd_2_earlier=ifelse(ano_bd_2>ano_bd_1,1,0))|> #es el dato de la derecha de una base de datos mas reciente.
  mutate(senda_status= case_when(senda_1=="si" & senda_2=="si"~ "both yes", senda_1=="no" & senda_2=="no"~ "both no", senda_1=="no" & senda_2=="si"~ "second yes", senda_1=="no" & senda_2=="no"~ "second no",  T~NA_character_))|>
  mutate(referral= ifelse(tr_compliance_1=="referral",1,0))|>
  mutate(days_overlapped=disch_date_num_1-adm_date_rec_num_2)|> # para que hayan dias positivos. Se supone que la fecha de egreso es más reciente que la fecha de ingreso del evento que superpone.
  mutate(more_dit=ifelse(dit_2>dit_1,1,0))|> #más días tratado en 2
  mutate(trat_1_within_2=ifelse(disch_date_num_1<disch_date_num_2 & adm_date_rec_num_1>adm_date_rec_num_2,1,0))|>
  mutate(trat_2_within_1=ifelse(disch_date_num_2<disch_date_num_1 & adm_date_rec_num_2>adm_date_rec_num_1,1,0))|>
  select(-hash_key_2) |> 
  rename("hash_key"="hash_key_1")
Number of overlapped dates, observations: 524 
Number of overlapped dates, RUNs: 411

In 2020, we followed this rules to discard overlapping cases. Now, we are trying to apply them sequentially rather than all at once.

Code
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))

bpmn::bpmn(paste0(wdpath,"cons/_input/overlapped_ranges_decision_tree.bpmn"))

Decision Tree for the Discard of Overlapping Dates in Cases

If we check the IDs of the centers with the most overlapping treatment days, we can see some sort of pattern. We think that this could be related to small changes in treatment modality or setting within treatment centers, similar to internal referrals. Sometimes can be related to the change of centers due to termination of agreements with SENDA.

Code
SISTRAT23_c1_2010_2022_df_prev1k|>
    select(id_centro, nombre_centro_rec) |> 
    filter(id_centro %in% attr(rev(sort(table(c(CONS_C1_df_dup_overlaps_COMP_after_miss_less30d$id_centro_1, CONS_C1_df_dup_overlaps_COMP_after_miss_less30d$id_centro_2)))),"names")[1:20]) |> 
  distinct(id_centro, .keep_all=T) |> 
  knitr::kable("markdown", caption= "Most frequent treatment centers with overlapped treatment dates")
Most frequent treatment centers with overlapped treatment dates
id_centro nombre_centro_rec
148 cosam quilicura
238 cosam la pintana
118 cosam lota
294 cosam talagante
142 centro de trat. y rehab. para personas con consumo perjudicial o dependencia a alcohol y/o drogas colina (ct. colina pr)
166 cosam enrique paris
291 cosam melipilla
502 centro de responsabilidad de salud mental del complejo asistencial dr.victor rios ruiz
161 centro comunitario de salud mental familiar (cosam pudahuel)
123 cosam newen
122 hospital de tome, centro superarte
295 crs salvador allende
109 cosam concepcion
117 comunidad terapeutica villamavida
146 cosam lampa
147 comunidad terapeutica manresa
141 cosam colina
106 cosam nuble (cadem de chillan)
136 consultorio alejandro gutierrez
328 cosam alenmoguen

Hence, treatment center might be an important criteria to judge overlappings.

Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("(==========================================================================)\n")
cat("Same center ID, Both tr. SENDA Yes/No, An episode in the middle of the other")
invisible("Yes")
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d |> 
  #discard multiple treatments
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
  #same CENTER ID
  filter(same_id==1)|> 
  #both financed/not financed by SENDA
  filter(grepl("both",senda_status))|> 
  #tr. in the middle of another
  filter(trat_1_within_2==1|trat_2_within_1==1)|>
  #select the row of the treatment to discard and the row of the treatment to keep
  mutate(rn_disc= case_when(trat_2_within_1==1~ rn_1, trat_1_within_2==1~ rn_2),
         rn_keep= case_when(trat_1_within_2==1~ rn_1, trat_2_within_1==1~ rn_2)) |> 
     (\(df) {
         cat(paste0("4.3a. same center, both financed/not financed by SENDA, one episode in the middle of the other, cases: ", formatC(nrow(df)*2, big.mark=",")),"\n")
         cat(paste0("4.3a. same center, both financed/not financed by SENDA, one episode in the middle of the other, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
         pull(df, rn_disc) ->> row_3a_discard_1st_tr
         pull(df, rn_keep) ->> row_3a_keep_2nd_tr
     })() 
# 4.3a. same center, both financed/not financed by SENDA, one episode in the middle of the other, cases: 114 ; 116 june 2025
# 4.3a. same center, both financed/not financed by SENDA, one episode in the middle of the other, RUNs: 57 ; 58 june 2025

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Same center ID; Both treatments are SENDA Yes or No= No; oldest episode should be modified")
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
mod_4ab<-
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d |> 
  #discard multiple treatments
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
  #same CENTER ID
  filter(same_id==1)|> 
  #both financed/not financed by SENDA
  filter(grepl("both",senda_status))|> 
  #no tr. in the middle of another
  filter(trat_1_within_2==0 & trat_2_within_1==0)|>
  # referral in the cause of discharge of the treatment with latter discharge date (oldest)
  mutate(crit_4ab= case_when(adm_date_rec_num_2< adm_date_rec_num_1 & tr_compliance_2=="referral"~ "4a.1. subtract days to second episode", adm_date_rec_num_1< adm_date_rec_num_2 & tr_compliance_1=="referral"~ "4a.2. subtract days to first episode", adm_date_rec_num_2<adm_date_rec_num_1 & tr_compliance_2!="referral"~ "4b.1. change cause of discharge and subtract days to second episode", adm_date_rec_num_1<adm_date_rec_num_2 & tr_compliance_1!="referral"~ "4b.2. change cause of discharge and subtract days to first episode", T~NA_character_ ))|> 
  # which is the oldest
  mutate(oldest= case_when(adm_date_rec_num_1<adm_date_rec_num_2~"oldest_1", T~ "oldest_2")) |> 
  #select the row of the treatment to discard and the row of the treatment to keep
  #4a) change amount of days treated
  mutate(disch_date_num_rec_2= case_when(grepl("\\.1\\.", crit_4ab)~ adm_date_rec_num_1-1, T~disch_date_num_2), 
         disch_date_num_rec_1= case_when(grepl("\\.2\\.", crit_4ab)~ adm_date_rec_num_2-1, T~disch_date_num_1))|>
  #4b) also change cause of discharge,now only for those with another cause of discharge
  mutate(tr_compliance_rec_2= case_when(grepl("4b\\.1", crit_4ab)~ "referral", T~tr_compliance_2), 
         tr_compliance_rec_1= case_when(grepl("4b\\.2", crit_4ab)~ "referral", T~tr_compliance_1))|>
  mutate(rn_mod_1= case_when(grepl("\\.1\\.", crit_4ab)~ rn_1, grepl("\\.2\\.", crit_4ab)~ rn_2))|> 
     (\(df) {
         cat(paste0("4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: ", formatC(nrow(subset(df, grepl("^4a\\.1", crit_4ab)))*2, big.mark=",")),"\n")
         cat(paste0("4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^4a\\.1", crit_4ab)), hash_key)), big.mark=",")),"\n")
         cat(paste0("4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: ", formatC(nrow(subset(df, grepl("^4a\\.2", crit_4ab)))*2, big.mark=",")),"\n")
         cat(paste0("4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^4a\\.2", crit_4ab)), hash_key)), big.mark=",")),"\n")
         cat(paste0("4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: ", formatC(nrow(subset(df,grepl("^4b\\.1", crit_4ab)))*2, big.mark=",")),"\n")
         cat(paste0("4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^4b\\.1", crit_4ab)), hash_key)), big.mark=",")),"\n")
         cat(paste0("4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: ", formatC(nrow(subset(df,grepl("^4b\\.2", crit_4ab)))*2, big.mark=",")),"\n")
         cat(paste0("4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^4b\\.2", crit_4ab)), hash_key)), big.mark=",")),"\n")
         df
     })() |> 
  select(hash_key, rn_1, rn_2, oldest, disch_date_num_rec_1, disch_date_num_rec_2, tr_compliance_rec_1, tr_compliance_rec_2, crit_4ab)
# 4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: 2 
# 4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: 1 
# 4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: 38 
# 4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: 19 
# 4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: 4 
# 4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: 2 
# 4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: 74 
# 4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: 37 

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("(==========================================================================)\n")
cat("2b-Same center ID, Different SENDA financing status")

CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d |> 
  #discard multiple treatments
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
  #same CENTER ID
  filter(same_id==1)|> 
  #Different financing status
  filter(!grepl("both",senda_status))|>
  #select the row of the earliest treatment and check if it is financed by SENDA
  mutate(crit_2b12= case_when(adm_date_rec_num_2>adm_date_rec_num_1 & senda_2=="si"~ "2b1.1. earliest treatment (2nd) financed by SENDA", adm_date_rec_num_1>adm_date_rec_num_2 & senda_1=="si"~ "2b1.2. earliest treatment (1st) financed by SENDA", adm_date_rec_num_2>adm_date_rec_num_1 & senda_2=="no"~ "2b2.1. earliest treatment (2nd) not financed by SENDA", adm_date_rec_num_1>adm_date_rec_num_2 & senda_1=="no"~ "2b2.2. earliest treatment (1st) not financed by SENDA", T~NA_character_ ))|>  # janitor::tabyl(crit_2b12)
  #select the row of the treatment to discard and the row of the treatment to keep
  mutate(rn_keep_2b1= case_when(grepl("^2b1\\.1", crit_2b12)~ rn_2, grepl("^2b1\\.2", crit_2b12)~ rn_1, T~NA_real_)) |> 
  mutate(rn_disc_2b1= case_when(grepl("^2b1\\.1", crit_2b12)~ rn_1, grepl("^2b1\\.2", crit_2b12)~ rn_2, T~NA_real_)) |> 
  mutate(rn_keep_2b2= case_when(grepl("^2b2\\.1", crit_2b12)~ rn_1, grepl("^2b2\\.2", crit_2b12)~ rn_2, T~NA_real_)) |> 
  mutate(rn_disc_2b2= case_when(grepl("^2b2\\.1", crit_2b12)~ rn_2, grepl("^2b2\\.2", crit_2b12)~ rn_1, T~NA_real_)) |>   
     (\(df) {
         cat(paste0("4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), cases: ", formatC(nrow(subset(df, grepl("^2b1\\.1", crit_2b12)))*2, big.mark=",")),"\n")
         cat(paste0("4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2b1\\.1", crit_2b12)), hash_key)), big.mark=",")),"\n")
         cat(paste0("4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), cases: ", formatC(nrow(subset(df, grepl("^2b1\\.2", crit_2b12)))*2, big.mark=",")),"\n")
         cat(paste0("4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2b1\\.2", crit_2b12)), hash_key)), big.mark=",")),"\n")

         cat(paste0("4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), cases: ", formatC(nrow(subset(df,grepl("^2b2\\.1", crit_2b12)))*2, big.mark=",")),"\n")
         cat(paste0("4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2b2\\.1", crit_2b12)), hash_key)), big.mark=",")),"\n")
         cat(paste0("4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), cases: ", formatC(nrow(subset(df,grepl("^2b2\\.2", crit_2b12)))*2, big.mark=",")),"\n")
         cat(paste0("4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2b2\\.2", crit_2b12)), hash_key)), big.mark=",")),"\n")         
          pull(df, rn_keep_2b2) ->> row_2b2_discard_1st_tr
          pull(df, rn_keep_2b2) ->> row_2b2_keep_2nd_tr
          pull(df, rn_disc_2b1) ->> row_2b1_discard_1st_tr
          pull(df, rn_keep_2b1) ->> row_2b1_keep_2nd_tr
     })()
# 4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), cases: 4 
# 4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), RUNs: 2 
# 4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), cases: 8 
# 4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), RUNs: 4 
# 4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), cases: 14 
# 4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), RUNs: 7 
# 4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), cases: 8 
# 4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), RUNs: 4 
# at june 2025
# 4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), cases: 4 
# 4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), RUNs: 2 
# 4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), cases: 8 
# 4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), RUNs: 4 
# 4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), cases: 16 
# 4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), RUNs: 8 
# 4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), cases: 10 
# 4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), RUNs: 5 

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("(==========================================================================)\n")
cat("35ab- Different center IDs; same SENDA financing statuses (Yes/No); one episode in the middle of the other; and the earliest treatment comes from a more recent yearly database")

CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d |> 
  #discard multiple treatments
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
  #different CENTER ID
  filter(same_id!=1)|> 
  #Same financing status
  filter(grepl("both",senda_status))|>
  #tr. in the middle of another
  filter(trat_1_within_2==1|trat_2_within_1==1)|>
  #earliest treatment comes from a more recent yearly database
  #select the row of the treatment to discard and the row of the treatment to keep
  #filter(case_when(trat_2_within_1==1 & ano_bd_1> ano_bd_2~T,T~F)) |> View()
  #Treatments with retrieval dates earlier within a treatment might be strange
  mutate(rn_disc_35a= case_when(trat_2_within_1==1 & ano_bd_1> ano_bd_2~ rn_2, trat_1_within_2==1 & ano_bd_2> ano_bd_1~ rn_1, T~NA_real_))|> 
  mutate(rn_disc_35b= case_when(trat_2_within_1==1 & ano_bd_1<= ano_bd_2~ rn_2, trat_1_within_2==1 & ano_bd_2<= ano_bd_1~ rn_1, T~NA_real_))|>
  mutate(rn_keep_35a= case_when(trat_2_within_1==1 & ano_bd_1> ano_bd_2~ rn_1, trat_1_within_2==1 & ano_bd_2> ano_bd_1~ rn_2, T~NA_real_))|> 
  mutate(rn_keep_35b= case_when(trat_2_within_1==1 & ano_bd_1<= ano_bd_2~ rn_1, trat_1_within_2==1 & ano_bd_2<= ano_bd_1~ rn_2, T~NA_real_))|>  
     (\(df) {
         pull(df, rn_disc_35a) ->> row_35a_disc_check_after
         pull(df, rn_keep_35a) ->> row_35a_keep_check_after
         pull(df, rn_disc_35b) ->> row_35b_discard_shortest
         pull(df, rn_keep_35b) ->> row_35b_keep_largest         
     })()
cat(paste0("4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, cases: ", formatC(length(row_35a_keep_check_after[!is.na(row_35a_keep_check_after)])*2, big.mark=",")),"\n")
cat(paste0("4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, RUNs: ", formatC(length(row_35a_keep_check_after[!is.na(row_35a_keep_check_after)]), big.mark=",")),"\n")
cat(paste0("4.35b. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was not earliest, cases: ", formatC(length(row_35b_keep_largest[!is.na(row_35b_keep_largest)])*2, big.mark=",")),"\n")
cat(paste0("4.35b. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was not earliest, RUNs: ", formatC(length(row_35b_keep_largest[!is.na(row_35b_keep_largest)]), big.mark=",")),"\n")

# june 2025
# 4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, cases: 68 
# 4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, RUNs: 34 
# 4.35b. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was not earliest, cases: 54 
# 4.35b. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was not earliest, RUNs: 27 


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("(==========================================================================)\n")

cat(paste0("35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, tr. length > 3 yrs., cases: "))
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d|> 
  filter(
    rn_1 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)] | 
    rn_2 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)]
  )|> 
  filter(case_when(trat_2_within_1==1 & ano_bd_1> ano_bd_2 & dit_1>1094~ T, T~ F))|> 
  nrow()

cat(paste0("35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, tr. length > 3 yrs., cases: "))
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d|> 
  filter(
    rn_1 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)] | 
    rn_2 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)]
  )|> 
  filter(case_when(trat_1_within_2==1 & ano_bd_2> ano_bd_1 & dit_2>1094~ T, T~ F))|> 
  nrow()

cat("The event of the left is the one that should be modified (aberant and largest treatment)")
replace_disch_date_35a21<-
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d |> 
  filter(
    rn_1 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)] | 
    rn_2 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)]
  )|> 
  filter(case_when(trat_2_within_1==1 & ano_bd_1> ano_bd_2 & dit_1>1094~ T, T~ F))|>
  mutate(disch_date_num_rec_35a21= adm_date_rec_num_1-1)|> 
  select(rn_1, disch_date_num_rec_35a21)

cat("The event of the right (rn_2) is the one that should be modified (aberant and largest treatment)")
replace_disch_date_35a22<-
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d|> 
  filter(
    rn_1 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)] | 
    rn_2 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)]
  )|> 
  filter(case_when(trat_1_within_2==1 & ano_bd_2> ano_bd_1 & dit_2>1094~ T, T~ F))|> 
  mutate(disch_date_num_rec_35a22= adm_date_rec_num_1-1)|> 
  select(rn_2, disch_date_num_rec_35a22)


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("(==========================================================================)\n")
cat("2_4ab- Different center IDs; same SENDA financing statuses (Yes/No); no treatment in the middle of another; latest (not the earlier) treatment have a referral for cause of discharge; oldest episode should be modified")
mod_2_4ab<-
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d |> 
  #discard multiple treatments
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
  #different CENTER ID
  filter(same_id!=1)|> 
  #Same financing status
  filter(grepl("both",senda_status))|>
  #tr. not in the middle of another
  filter(trat_1_within_2==0 & trat_2_within_1==0)|>
  # referral in the cause of discharge of the treatment with latter discharge date (oldest)
  mutate(crit_2_4ab= case_when(adm_date_rec_num_2<adm_date_rec_num_1 & tr_compliance_2=="referral"~ "2_4a.1. subtract days to second episode", adm_date_rec_num_1<adm_date_rec_num_2 & tr_compliance_1=="referral"~ "2_4a.2. subtract days to first episode", adm_date_rec_num_2<adm_date_rec_num_1 & tr_compliance_2!="referral"~ "2_4b.1. change cause of discharge and subtract days to second episode", adm_date_rec_num_1<adm_date_rec_num_2 & tr_compliance_1!="referral"~ "2_4b.2. change cause of discharge and subtract days to first episode", T~NA_character_ ))|> #janitor::tabyl(crit_2_4ab)
    # which is the oldest
  mutate(oldest= case_when(adm_date_rec_num_1<adm_date_rec_num_2~"oldest_1", T~ "oldest_2"))|> 
  #select the row of the treatment to discard and the row of the treatment to keep
  #4a) change amount of days treated
  mutate(disch_date_num_rec_2= case_when(grepl("\\.1\\.", crit_2_4ab)~ adm_date_rec_num_1-1, T~disch_date_num_2), 
         disch_date_num_rec_1= case_when(grepl("\\.2\\.", crit_2_4ab)~ adm_date_rec_num_2-1, T~disch_date_num_1))|>
  #4b) also change cause of discharge,now only for those with another cause of discharge
  mutate(tr_compliance_rec_2= case_when(grepl("2_4b\\.1", crit_2_4ab)~ "referral", T~tr_compliance_2), 
         tr_compliance_rec_1= case_when(grepl("2_4b\\.2", crit_2_4ab)~ "referral", T~tr_compliance_1))|>
  mutate(rn_mod_1= case_when(grepl("\\.1\\.", crit_2_4ab)~ rn_1, grepl("\\.2\\.", crit_2_4ab)~ rn_2))|> 
     (\(df) {
         cat(paste0("4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: ", formatC(nrow(subset(df, grepl("^2_4a\\.1", crit_2_4ab)))*2, big.mark=",")),"\n")
         cat(paste0("4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_4a\\.1", crit_2_4ab)), hash_key)), big.mark=",")),"\n")
         cat(paste0("4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: ", formatC(nrow(subset(df, grepl("^2_4a\\.2", crit_2_4ab)))*2, big.mark=",")),"\n")
         cat(paste0("4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_4a\\.2", crit_2_4ab)), hash_key)), big.mark=",")),"\n")
         cat(paste0("4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: ", formatC(nrow(subset(df,grepl("^2_4b\\.1", crit_2_4ab)))*2, big.mark=",")),"\n")
         cat(paste0("4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_4b\\.1", crit_2_4ab)), hash_key)), big.mark=",")),"\n")
         cat(paste0("4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: ", formatC(nrow(subset(df,grepl("^2_4b\\.2", crit_2_4ab)))*2, big.mark=",")),"\n")
         cat(paste0("4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_4b\\.2", crit_2_4ab)), hash_key)), big.mark=",")),"\n")
         df
     })()|> 
  select(hash_key, rn_1, rn_2, oldest, disch_date_num_rec_1, disch_date_num_rec_2, tr_compliance_rec_1, tr_compliance_rec_2, crit_2_4ab)
  #select the row of the treatment to discard and the row of the treatment to keep

# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: 2 
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: 1 
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: 118 
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: 59 
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: 0 
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: 0 
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: 66 
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: 33 

# june 2025
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: 2 
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: 1 
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: 118 
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: 59 
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: 0 
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: 0 
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: 68 
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: 34 

#                                                           crit_2_4ab  n    percent
#                             2_4a.1. substract days to second episode  1 0.01075269
#                              2_4a.2. substract days to first episode 59 0.63440860
# 2_4b.2. change cause of discharge and substract days to first episode 33 0.35483871


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("(==========================================================================)\n")
cat("2_2b-Different center ID, Different SENDA financing status, earlier treatment financed by SENDA")


CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d|> 
  #discard multiple treatments
  filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
  #different CENTER ID
  filter(same_id!=1)|> 
  #Different financing status
  filter(!grepl("both",senda_status))|>
  #select the row of the earliest treatment and check if it is financed by SENDA
  mutate(crit_2_2b12= case_when(adm_date_rec_num_2>adm_date_rec_num_1 & senda_2=="si"~ "2_2b1.1. earliest treatment (2nd) financed by SENDA", adm_date_rec_num_1>adm_date_rec_num_2 & senda_1=="si"~ "2_2b1.2. earliest treatment (1st) financed by SENDA", adm_date_rec_num_2>adm_date_rec_num_1 & senda_2=="no"~ "2_2b2.1. earliest treatment (2nd) not financed by SENDA", adm_date_rec_num_1>adm_date_rec_num_2 & senda_1=="no"~ "2_2b2.2. earliest treatment (1st) not financed by SENDA", T~NA_character_ ))|>  # janitor::tabyl(crit_2b12)
  #select the row of the treatment to discard and the row of the treatment to keep
  mutate(rn_keep= case_when(grepl("^2_2b1\\.1", crit_2_2b12)~ rn_2, grepl("^2_2b1\\.2", crit_2_2b12)~ rn_1, grepl("^2b2\\.1", crit_2_2b12)~ rn_2, grepl("^2_2b2\\.2", crit_2_2b12)~ rn_1))|> 
  mutate(rn_disc= case_when(grepl("^2_2b1\\.1", crit_2_2b12)~ rn_1, grepl("^2_2b1\\.2", crit_2_2b12)~ rn_2, grepl("^2b2\\.1", crit_2_2b12)~ rn_1, grepl("^2_2b2\\.2", crit_2_2b12)~ rn_2))|> #janitor::tabyl(crit_2_2b12)
     (\(df) {
         cat(paste0("4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), cases: ", formatC(nrow(subset(df, grepl("^2_2b1\\.1", crit_2_2b12)))*2, big.mark=",")),"\n")
         cat(paste0("4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_2b1\\.1", crit_2_2b12)), hash_key)), big.mark=",")),"\n")
         cat(paste0("4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), cases: ", formatC(nrow(subset(df, grepl("^2_2b1\\.2", crit_2_2b12)))*2, big.mark=",")),"\n")
         cat(paste0("4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_2b1\\.2", crit_2_2b12)), hash_key)), big.mark=",")),"\n")

         cat(paste0("4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), cases: ", formatC(nrow(subset(df,grepl("^2_2b2\\.1", crit_2_2b12)))*2, big.mark=",")),"\n")
         cat(paste0("4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_2b2\\.1", crit_2_2b12)), hash_key)), big.mark=",")),"\n")
         cat(paste0("4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), cases: ", formatC(nrow(subset(df,grepl("^2_2b2\\.2", crit_2_2b12)))*2, big.mark=",")),"\n")
         cat(paste0("4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_2b2\\.2", crit_2_2b12)), hash_key)), big.mark=",")),"\n")         
         pull(df, rn_disc) ->> row_2_2b_discard_1st_tr
         pull(df, rn_keep) ->> row_2_2b_keep_2nd_tr         
     })()
# 4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), cases: 4 
# 4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), RUNs: 2 
# 4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), cases: 16 
# 4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), RUNs: 8 
# 4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), cases: 70 
# 4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), RUNs: 35 
# 4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), cases: 22 
# 4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), RUNs: 11 

# june 2025
# 4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), cases: 6 
# 4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), RUNs: 3 
# 4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), cases: 16 
# 4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), RUNs: 8 
# 4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), cases: 72 
# 4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), RUNs: 36 
# 4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), cases: 22 
# 4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), RUNs: 11 
(==========================================================================)
Same center ID, Both tr. SENDA Yes/No, An episode in the middle of the other4.3a. same center, both financed/not financed by SENDA, one episode in the middle of the other, cases: 116 
4.3a. same center, both financed/not financed by SENDA, one episode in the middle of the other, RUNs: 58 
Same center ID; Both treatments are SENDA Yes or No= No; oldest episode should be modified4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: 2 
4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: 1 
4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: 38 
4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: 19 
4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: 4 
4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: 2 
4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: 74 
4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: 37 
(==========================================================================)
2b-Same center ID, Different SENDA financing status4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), cases: 4 
4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), RUNs: 2 
4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), cases: 8 
4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), RUNs: 4 
4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), cases: 16 
4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), RUNs: 8 
4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), cases: 10 
4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), RUNs: 5 
(==========================================================================)
35ab- Different center IDs; same SENDA financing statuses (Yes/No); one episode in the middle of the other; and the earliest treatment comes from a more recent yearly database4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, cases: 68 
4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, RUNs: 34 
4.35b. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was not earliest, cases: 54 
4.35b. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was not earliest, RUNs: 27 
(==========================================================================)
35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, tr. length > 3 yrs., cases: [1] 0
35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, tr. length > 3 yrs., cases: [1] 4
The event of the left is the one that should be modified (aberant and largest treatment)The event of the right (rn_2) is the one that should be modified (aberant and largest treatment)(==========================================================================)
2_4ab- Different center IDs; same SENDA financing statuses (Yes/No); no treatment in the middle of another; latest (not the earlier) treatment have a referral for cause of discharge; oldest episode should be modified4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: 2 
4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: 1 
4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: 118 
4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: 59 
4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: 0 
4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: 0 
4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: 68 
4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: 34 
(==========================================================================)
2_2b-Different center ID, Different SENDA financing status, earlier treatment financed by SENDA4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), cases: 6 
4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), RUNs: 3 
4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), cases: 16 
4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), RUNs: 8 
4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), cases: 72 
4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), RUNs: 36 
4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), cases: 22 
4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), RUNs: 11 

We apply these rules into the dataset and check overlappings again

Code
# 2_4ab- Different center IDs; same SENDA financing statuses (Yes/No); no treatment in the middle of another; latest (not the earlier) treatment have a referral for cause of discharge
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral 
# 2_2b-Different center ID, Different SENDA financing status, earlier treatment financed by SENDA
# 2_2b1.1 different center, one financed by SENDA, earliest treatment 
# 2_2b2.1 different center, one financed by SENDA, not the earliest treatment 


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

SISTRAT23_c1_2010_2022_df_prev1l<-
  SISTRAT23_c1_2010_2022_df_prev1k|>
      (\(df) {
    cat(paste0("4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })()|> 
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  filter(!(rn %in% row_3a_discard_1st_tr))|>
  mutate(OBS= case_when(rn %in% row_3a_keep_2nd_tr~ paste0(as.character(OBS),";","4.3a.Same center ID, same SENDA financing status, one tr. episode in the middle of the other. Discarded shorter episode"), T~ OBS))|> 
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
  left_join(mod_4ab, by=c("rn"="rn_1"), suffix=c("","_4ab_1st_ep"))|> 
  left_join(mod_4ab, by=c("rn"="rn_2"), suffix=c("","_4ab_2nd_ep"))|> 
  #given it is paired with row numbers. The matching rownumbers will be those paired
  mutate(disch_date_num_rec3= case_when(oldest=="oldest_1"~ disch_date_num_rec_1, T~ disch_date_num_rec2))|>
  #because the second episode matched with the row number of the second column
  mutate(disch_date_num_rec3= case_when(oldest_4ab_2nd_ep=="oldest_2"~ disch_date_num_rec_2_4ab_2nd_ep, T~ disch_date_num_rec3))|> 
  # change cause of discharge to referral
  mutate(tr_compliance_rec= case_when(oldest=="oldest_1"~ "referral", T~ tr_compliance_rec))|>
  mutate(tr_compliance_rec= case_when(oldest_4ab_2nd_ep=="oldest_2"~ "referral", T~ tr_compliance_rec))|>
  # add observations to rows.
  mutate(OBS= case_when(oldest== "oldest_1" & grepl("^4b", crit_4ab)~ paste0(as.character(OBS),";","4.4b.Same center ID, same SENDA financing status, subtracted days to oldest episode and change discharge cause to referral"), T~ OBS))|>
  mutate(OBS= case_when(oldest== "oldest_1" & grepl("^4a", crit_4ab)~ paste0(as.character(OBS),";","4.4a.Same center ID, same SENDA financing status, subtracted days to oldest episode"), T~ OBS))|>
  mutate(OBS= case_when(oldest== "oldest_2" & grepl("^4b", crit_4ab_4ab_2nd_ep)~ paste0(as.character(OBS),";","4.4b.Same center ID, same SENDA financing status, subtracted days to oldest episode and change discharge cause to referral"), T~ OBS))|>
  mutate(OBS= case_when(oldest== "oldest_2" & grepl("^4a", crit_4ab_4ab_2nd_ep)~ paste0(as.character(OBS),";","4.4a.Same center ID, same SENDA financing status, subtracted days to oldest episode"), T~ OBS))|>
  #discard any columns related to the join with duplicates database
  select(- (any_of(c(contains("4ab_2nd_ep"), contains("_4ab_1st_ep"))))) |> 
  select(-any_of(setdiff(colnames(mod_4ab), "hash_key")))|> 
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
  filter(!(rn %in% c(row_2b1_discard_1st_tr[!is.na(row_2b1_discard_1st_tr)], row_2b2_discard_1st_tr[!is.na(row_2b2_discard_1st_tr)])))|>
  mutate(OBS= case_when(rn %in% row_2b1_keep_2nd_tr[!is.na(row_2b1_keep_2nd_tr)]~ paste0(as.character(OBS),";","4.2_2b1.Different center ID, earliest tr. financed by SENDA, kept the earliest"), T~ OBS))|>
  mutate(OBS= case_when(rn %in% row_2b2_keep_2nd_tr[!is.na(row_2b2_keep_2nd_tr)]~ paste0(as.character(OBS),";","4.2_2b2.Different center ID, earliest tr. not financed by SENDA, kept the oldest"), T~ OBS))|> 
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
  filter(!(rn %in% row_35b_discard_shortest[!is.na(row_35b_discard_shortest)]))|>
  mutate(OBS= case_when(rn %in% row_35b_keep_largest[!is.na(row_35b_keep_largest)]~ paste0(as.character(OBS),";","4.35b.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earlier, kept the lagest"), T~ OBS))|>
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
   # row_35a_keep_check_after  row_35a_disc_check_after =need to check tr. duration. if >1045 and greater, then cut this treatment to the correct treatment wihtin
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
  #replace_disch_date_35a22  replace_disch_date_35a21 
  #row_35a_disc_check_after  row_35a_keep_check_after #4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, yearly retrieval date was the earliest
  mutate(OBS= case_when(rn %in% setdiff(row_35a_keep_check_after, replace_disch_date_35a22$rn_2[!is.na(replace_disch_date_35a22$rn_2)])~ paste0(as.character(OBS),";","4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards"), T~ OBS))|>
  mutate(OBS= case_when(rn %in% setdiff(row_35a_disc_check_after, replace_disch_date_35a22$rn_2[!is.na(replace_disch_date_35a22$rn_2)])~ paste0(as.character(OBS),";","4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards"), T~ OBS))|>
  mutate(OBS= case_when(rn %in% setdiff(row_35a_keep_check_after, replace_disch_date_35a22$rn_1[!is.na(replace_disch_date_35a21$rn_1)])~ paste0(as.character(OBS),";","4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards"), T~ OBS))|>
  mutate(OBS= case_when(rn %in% setdiff(row_35a_disc_check_after, replace_disch_date_35a22$rn_1[!is.na(replace_disch_date_35a21$rn_1)])~ paste0(as.character(OBS),";","4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards"), T~ OBS))|>
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
  # for the cases with >1094 days in tr.
  left_join(replace_disch_date_35a21, by=c("rn"="rn_1"), suffix= c("","_35a21"))|> 
  left_join(replace_disch_date_35a22, by=c("rn"="rn_2"), suffix= c("","_35a22"))|>
  mutate(disch_date_num_rec4= case_when(!is.na(disch_date_num_rec_35a21)~ disch_date_num_rec_35a21, T~ disch_date_num_rec3))|>
  mutate(disch_date_num_rec4= case_when(!is.na(disch_date_num_rec_35a22)~ disch_date_num_rec_35a22, T~ disch_date_num_rec4))|>
  mutate(OBS= case_when(rn %in% c(replace_disch_date_35a21$rn_1, replace_disch_date_35a22$rn_2)~ paste0(as.character(OBS),";","4.35a2.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, more 1094 days in tr., substract days in tr."), T~ OBS))|>
  select(-disch_date_num_rec_35a21, -disch_date_num_rec_35a22)|>
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
  left_join(mod_2_4ab, by=c("rn"="rn_1"), suffix=c("","_2_4ab_1st_ep"))|> 
  left_join(mod_2_4ab, by=c("rn"="rn_2"), suffix=c("","_2_4ab_2nd_ep"))|> 
  #Rows: 150,081
  #given it is paired with row numbers. The matching rownumbers will be those paired
  mutate(disch_date_num_rec5= case_when(oldest=="oldest_1"~ disch_date_num_rec_1, T~ disch_date_num_rec4))|>
  #because the second episode matched with the row number of the second column
  mutate(disch_date_num_rec5= case_when(oldest_2_4ab_2nd_ep=="oldest_2"~ disch_date_num_rec_2_2_4ab_2nd_ep, T~ disch_date_num_rec5))|> 
  # change cause of discharge to referral
  mutate(tr_compliance_rec= case_when(oldest=="oldest_1"~ "referral", T~ tr_compliance_rec))|>
  mutate(tr_compliance_rec= case_when(oldest_2_4ab_2nd_ep=="oldest_2"~ "referral", T~ tr_compliance_rec))|>
  # add observations to rows.
  mutate(OBS= case_when(oldest== "oldest_1" & grepl("^2_4b", crit_2_4ab)~ paste0(as.character(OBS),";","4.2_4b.Different center ID, same SENDA financing status, subtracted days to oldest episode and change discharge cause to referral"), T~ OBS))|>
  mutate(OBS= case_when(oldest== "oldest_1" & grepl("^2_4a", crit_2_4ab)~ paste0(as.character(OBS),";","4.2_4a.Differnt center ID, same SENDA financing status, subtracted days to oldest episode"), T~ OBS))|>
  mutate(OBS= case_when(oldest== "oldest_2" & grepl("^2_4b", crit_2_4ab_2_4ab_2nd_ep)~ paste0(as.character(OBS),";","4.2_4b.Different center ID, same SENDA financing status, subtracted days to oldest episode and change discharge cause to referral"), T~ OBS))|>
  mutate(OBS= case_when(oldest== "oldest_2" & grepl("^2_4a", crit_2_4ab_2_4ab_2nd_ep)~ paste0(as.character(OBS),";","4.2_4a.Different center ID, same SENDA financing status, subtracted days to oldest episode"), T~ OBS))|>
  #discard any columns related to the join with duplicates database
  select(- (any_of(c(contains("2_4ab_2nd_ep"), contains("2_4ab_1st_ep")))))|> 
  select(-any_of(setdiff(colnames(mod_2_4ab), "hash_key")))|> 
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
  mutate(disch_date_num_rec5= disch_date_num_rec5, dit_rec5= disch_date_num_rec5- adm_date_rec_num, disch_date_rec5= as.Date(disch_date_num_rec5, "1970-01-01"))|> 
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
        (\(df) {
    cat(paste0("4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1k))stop("Error: Added treatment episodes in the process")
    df
  })()
# 4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, cases: 150,181 
# 4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, RUNs: 106,283 
# 4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, cases: 150,077 
# 4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, RUNs: 106,283 

#4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases // 4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases:
4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, cases: 150,181 
4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, RUNs: 106,283 
4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, cases: 150,077 
4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, RUNs: 106,283 

The generated database is called SISTRAT23_c1_2010_2022_df_prev1l. We included the variables disch_date_rec, dit_rec5, and disch_date_num_rec5, as the result of replacing values for the deduplication process.


Probabilistic Matches

We selected matches in the rows with the same hash, treatment center ID, date of admission, type of plan, educational attainment, sex and ID, within ages and with a match score greater than or equal to 70.

Code
library(fastLink)
Sys.setenv(OMP_NUM_THREADS = parallel::detectCores()- 1) 

cores_min_1 <- parallel::detectCores()- 1

data <- as_tidytable(mutate(SISTRAT23_c1_2010_2022_df_prev1l, across(where(is.character), ~ifelse(is.na(.), "", .))) |> mutate(id_match = row_number()))

# Create a safe function to handle record linkage by block
process_block <- function(block_data) {
  # Skip if too few records
  if(nrow(block_data) <= 1) return(tidytable())
  
  # Create copies for comparison
  data1 <- data2 <- block_data
  
  # Identify columns for comparison
  varnames <- c("hash_key", "adm_date_rec_num", "disch_date_num_rec5", "plan_type", "id_centro", "ed_attainment", "sexo", "TABLE_rec")
  
  # Convert columns to character and handle NAs
  for(col in varnames) {
    if(col %in% names(data1)) {
      data1 <- data1|>  
        mutate({{ col }} := as.character(get(col)))|> 
        mutate({{ col }} := ifelse(is.na(get(col)), "", get(col)))
      
      data2 <- data2|> 
        mutate({{ col }} := as.character(get(col)))|> 
        mutate({{ col }} := ifelse(is.na(get(col)), "", get(col)))
    } else {
      data1 <- data1|>  mutate({{ col }} := "")
      data2 <- data2|>  mutate({{ col }} := "")
    }
  }
  
  # Use tryCatch to handle errors
  result <- tryCatch({
    # Run fastLink
    fl_out <- fastLink(
      dfA = data1,
      dfB = data2,
      varnames = setdiff(varnames, c("adm_date_rec_num", "disch_date_num_rec5")),
      stringdist.match = setdiff(varnames, c("adm_date_rec_num", "disch_date_num_rec5")),
      threshold.match = 0.9,
      n.cores = cores_min_1
    )
    
    # Extract matches
    if(length(fl_out$matches$inds.a) > 0) {
      tidytable(
        id_match_1 = data1$id_match[fl_out$matches$inds.a],
        id_match_2 = data2$id_match[fl_out$matches$inds.b],
        disch_date_num_rec5_1 = data1$disch_date_num_rec5[fl_out$matches$inds.a],
        disch_date_num_rec5_2 = data2$disch_date_num_rec5[fl_out$matches$inds.b],
        adm_date_rec_num_1 = data1$adm_date_rec_num[fl_out$matches$inds.a],
        adm_date_rec_num_2 = data2$adm_date_rec_num[fl_out$matches$inds.b],       
        match_score = fl_out$posterior
      )|> 
      filter(id_match_1 < id_match_2)|>  # Remove self-matches
      mutate(disch_date_num_rec5_1= ifelse(disch_date_num_rec5_1=="", 19475, as.numeric(disch_date_num_rec5_1)))|> 
      mutate(disch_date_num_rec5_2= ifelse(disch_date_num_rec5_2=="", 19475, as.numeric(disch_date_num_rec5_2)))|> 
      mutate(adm_date_rec_num_1= as.numeric(adm_date_rec_num_1))|> 
      mutate(adm_date_rec_num_2= as.numeric(adm_date_rec_num_2))|> 
      mutate(overlap= case_when(
        adm_date_rec_num_1 < disch_date_num_rec5_2 & # x Admitted before being admitted into another treatment
         disch_date_num_rec5_1 > adm_date_rec_num_2~ 1,T~0))# x Discharged after being discharged from another treatment 
    } else {
      tidytable()
    }
  }, error = function(e) {
    cat("Error in block:", unique(block_data$yr_block), "\n")
    print(e)
    tidytable()
  })
  
  return(result)
}

# Process by blocks
all_matches <- tidytable()
blocks <- unique(data$yr_block)

for(b in blocks) {
  cat("Processing block:", b, "\n")
  block_data <- data %>% filter(yr_block == b)
  block_matches <- process_block(block_data)
  
  if(nrow(block_matches) > 0) {
    all_matches <- bind_rows(all_matches, block_matches)
  }
}

# Join with original data
final_results <- data|>
  left_join(all_matches, by = c("id_match" = "id_match_1"))|>
  mutate(match_score= sprintf("%1.2f", match_score))
#rio::export(final_results,"E:/Mi unidad/Alvacast/SISTRAT 2023/data/20241015_out/final_results.rds")
Code
final_results_overlap <-
final_results|> filter(overlap==1)|> mutate(comb_hash_adm_date= paste0(hash_key,"_",adm_date_rec))|> 
  select(any_of(c("comb_hash_adm_date", "adm_date_rec_num", "disch_date_num_rec5", "plan_type", "id_centro", "ed_attainment", "sexo", "TABLE_rec", "id_match", "id_match_2", "disch_date_num_rec5_1", "disch_date_num_rec5_2", "adm_date_rec_num_1", "adm_date_rec_num_2", "match_score")))

cat(paste0("Overlaps w/ >.90 match score: ", formatC(nrow(final_results_overlap), big.mark=",")),"\n")

This approach has proven not useful, so we went back to the straight detection of overlappings.

0.c. Resolution of most problematic cases and multiple overlaps

We apply the rules to detect for overlaps again. We used the SISTRAT23_c1_2010_2022_df_prev1l dataset, which is the one that has been cleaned of duplicates and has the new variables.

Code
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id<- 
  SISTRAT23_c1_2010_2022_df_prev1l|>
    mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec5), 19475, disch_date_num_rec5))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
    rename("hash_key_2"="hash_key", "rn2"="rn")|>
    select(rn2, hash_key_2, TABLE, adm_age_rec, adm_date_rec, adm_date_rec_num , disch_date_rec0, disch_date_num_miss, dit_rec5, id_centro, tr_compliance_rec, plan_type, senda)|> 
    #dplyr::filter(motivodeegreso!="Derivación")|>
    data.table::as.data.table()
  
overlap_dates_C1_after_miss_less30d_0d_center_id <- janitor::clean_names(
    sqldf::sqldf(
      "
      SELECT *
      FROM CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id AS x
      INNER JOIN CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id AS y 
      ON x.hash_key_2 = y.hash_key_2 
         AND x.rn2 < y.rn2  -- Avoids duplicates (eg.: x vs y and then y vs x)
         AND x.adm_date_rec_num < y.disch_date_num_miss  -- x Admitted before being admitted into another treatment
         AND x.disch_date_num_miss > y.adm_date_rec_num  -- x Discharged after being admitted in other
         "
    ))|>
    `colnames<-`(c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1",  "rn_2", "hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2")) 
  
    cat(paste0("Number of overlapped dates, observations: ", nrow(overlap_dates_C1_after_miss_less30d_0d_center_id)),"\n")
    cat(paste0("Number of overlapped dates, RUNs: ", nrow(distinct(overlap_dates_C1_after_miss_less30d_0d_center_id, hash_key_1))))
    #Number of overlapped dates, observations: 306 # 266 # 263 june 2025
    #Number of overlapped dates, RUNs: 170 # 156 # 154 june 2025

#The rows on the left originate from older databases.
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id <- 
as_tidytable(overlap_dates_C1_after_miss_less30d_0d_center_id)|>
  mutate(pair_id= paste0(rn_1,"_",rn_2))|> 
  mutate(same_id=ifelse(id_centro_1==id_centro_2,1,0))|>
  mutate(bd_2_earlier=ifelse(ano_bd_2>ano_bd_1,1,0))|> #es el dato de la derecha de una base de datos mas reciente.
  mutate(senda_status= case_when(senda_1=="si" & senda_2=="si"~ "both yes", senda_1=="no" & senda_2=="no"~ "both no", senda_1=="no" & senda_2=="si"~ "second yes", senda_1=="no" & senda_2=="no"~ "second no",  T~NA_character_))|>
  mutate(referral= ifelse(tr_compliance_1=="referral",1,0))|>
  mutate(days_overlapped=disch_date_num_1-adm_date_rec_num_2)|> # para que hayan dias positivos. Se supone que la fecha de egreso es más reciente que la fecha de ingreso del evento que superpone.
  mutate(more_dit=ifelse(dit_2>dit_1,1,0))|> #más días tratado en 2
  mutate(trat_1_within_2=ifelse(disch_date_num_1<disch_date_num_2 & adm_date_rec_num_1>adm_date_rec_num_2,1,0))|>
  mutate(trat_2_within_1=ifelse(disch_date_num_2<disch_date_num_1 & adm_date_rec_num_2>adm_date_rec_num_1,1,0))|>
  select(-hash_key_2) |> 
  rename("hash_key"="hash_key_1")

warning("2025-04-09: The conditions now should be that the row number is present in the Excel file and also in the rows vector where more than one overlap was detected. Otherwise, outdated cases will be corrected, which, due to the correction of the truncation date in the 2019 database, are no longer valid as overlaps.")

Warning: 2025-04-09: The conditions now should be that the row number is present in the Excel file and also in the rows vector where more than one overlap was detected. Otherwise, outdated cases will be corrected, which, due to the correction of the truncation date in the 2019 database, are no longer valid as overlaps.

Number of overlapped dates, observations: 263 
Number of overlapped dates, RUNs: 154
Code
cat("Explore whether there are more than one overlapping treatment episodes within the same center ID, and if so, how many times it occurs, after replacing center ID and previous steps in overlappings.\n")
overlaps_after_miss_appear_more_than_one_time_post_center_id<-
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id|>
    tidytable::pivot_longer(
        cols = matches("_[12]$"),  # All columns ending with _1 or _2
        names_to = c(".value", "wave"),
        names_pattern = "(.+)_([12])",
        values_drop_na = FALSE) |> 
    group_by(rn) |> 
    count() |> 
filter(n>1) |> pull(rn)

cat("Have they changed?")
!identical(overlaps_after_miss_appear_more_than_one_time, overlaps_after_miss_appear_more_than_one_time_post_center_id)

cat(paste0("Number of overlaps after replacing center ID, episodes: ", formatC(length(overlaps_after_miss_appear_more_than_one_time_post_center_id), big.mark=",")),"\n")

cat(paste0("Number of overlapping combinations after replacing center IDs: ", formatC(nrow(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id|> filter((rn_1 %in% overlaps_after_miss_appear_more_than_one_time | rn_2 %in% overlaps_after_miss_appear_more_than_one_time))), big.mark=",")),"\n")

#Number of overlaps after replacing center ID, episodes: 106 # 105 june 2025
#Number of overlapping combinations after replacing center IDs: 176 # 174 june 2025 

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id|> 
#   filter((rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
#          rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
#    filter((rn_1 %in% overlaps_after_miss_appear_more_than_one_time_post_center_id | 
#          rn_2 %in% overlaps_after_miss_appear_more_than_one_time_post_center_id))

CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id|> 
  filter((rn_1 %in% overlaps_after_miss_appear_more_than_one_time | 
         rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
          (\(df) {
    cat(paste0("More than one overlapping, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("More than one overlapping, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    #export HASHes to study them
    distinct(df, hash_key)|>  pull(hash_key) ->> hash_multiple_overlaps_after_center_id
  })()
# More than one overlapping, cases: 176 # 174 june 2025
# More than one overlapping, RUNs: 68 #67 june 2025

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

more_one_overlap_after_center_id<-
  SISTRAT23_c1_2010_2022_df_prev1l|>
    filter(hash_key %in% as.character(hash_multiple_overlaps_after_center_id))|> 
    mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec5), 19475, disch_date_num_rec5))|> #equivalent to 2023-04-28 as.numeric(as.Date("2023-01-01"))
    select(hash_key, rn, TABLE_rec, adm_age_rec, senda_adm_date, adm_date_rec, adm_date_rec_num , disch_date_rec5, disch_date_num_miss, dit_rec5, id_centro, tr_compliance, plan_type, senda)|> 
      mutate(hash_key=factor(hash_key))


invisible("To check problematic cases\n")
if(length(ls()[grepl("no_mostrar", ls())])>0){
  
  SISTRAT23_c1_2010_2022_df_prev1l|>
  filter(hash_key=="0d3452833c9825ed178e4aea8da2bd30f86b1e5e1839fdc57e7e446105bcedde")|> 
    mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec5), 19475, disch_date_num_rec5))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
    select(rn, hash_key, TABLE_rec, adm_age_rec, senda_adm_date, adm_date_rec, adm_date_rec_num , disch_date, disch_date_num_miss, dit_rec5, id_centro, tr_compliance, plan_type, senda, OBS) |> 
    glimpse()

  }
("To explore what are the rows that enter in conflict, to help us in the analysis of overlapings\n")
if(length(ls()[grepl("no_mostrar", ls())])>0){
    CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id|>
      filter(rn_1==76076|rn_2==76076) |> select(rn_1, rn_2, adm_date_1, disch_date_1, adm_date_2, disch_date_2)
    
    opc <- c(10716,
  2678,
  5505)
    SISTRAT23_c1_2010_2022_df_prev1l|> 
      filter(rn %in% opc)|> 
      select(OBS)
    CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id|> 
      filter((rn_1 %in% opc | 
             rn_2 %in% opc))|>
      select(rn_1, rn_2, adm_date_1, disch_date_1, adm_date_2, disch_date_2, hash_key)
    #3e97be604b540841225cf7948ed4e822c969cba6c5b6484c916e0bb109cd38e4
}

cat("We export the dataset with more than one overlap to check it manually.\n")
result_more_one_overlap_after_center_id <- aggregate(rn ~ hash_key, data = more_one_overlap_after_center_id|>
  mutate(hash_key=as.numeric(hash_key)), FUN = function(x) paste(x, collapse = ","))

more_one_overlap_after_center_id|>
  mutate(hash_key=as.numeric(hash_key))|> 
    rio::export(paste0(wdpath, "cons/_out/more_one_overlaps_after_center_id.xlsx"))
Explore whether there are more than one overlapping treatment episodes within the same center ID, and if so, how many times it occurs, after replacing center ID and previous steps in overlappings.
Have they changed?[1] FALSE
Number of overlaps after replacing center ID, episodes: 105 
Number of overlapping combinations after replacing center IDs: 174 
More than one overlapping, cases: 174 
More than one overlapping, RUNs: 67 
[1] "To explore what are the rows that enter in conflict, to help us in the analysis of overlapings\n"
We export the dataset with more than one overlap to check it manually.

Summary of Manual Data Cleaning for Overlapping Treatments

Manual adjustments were made to resolve overlapping treatment episodes based on the following criteria:

  • Prioritized SENDA Admission Date (adm_date_senda): Used adm_date_senda over adm_date to resolve overlaps, especially for treatments >1094 days or from pre-2012 databases.
  • Handled Multiple Ongoing Treatments: Retained the most recent ongoing treatment; adjusted the previous discharge date to one day before the next admission.
  • Prioritized Recent Data: When overlaps occurred between records from different database years, the record from the most recent year was kept.
  • Managed Missing Discharge Dates: Replaced missing discharge dates with the subsequent admission date minus one day, if applicable.
  • Removed Unreliable Long Treatments: Eliminated treatments >1094 days if they lacked SENDA funding or originated from pre-2012 databases without a discharge date.
  • Addressed Short Overlaps (<15 days): Considered these minor discrepancies, likely due to administrative delays, and resolved by retaining the most plausible record.
  • Handled Referrals: Prioritized referrals from more recent databases in case of overlaps. If one treatment absorbed others (especially if SENDA-funded and recent), only the absorbing record was kept.
  • Noted Truncated 2019 Data: Acknowledged that treatments recorded in the 2019 database might be truncated as of Nov 13, 2019, using dias_en_tratamiento for duration calculations.
  • Flagged Ongoing Status: Marked treatments listed as “ongoing” for future status updates.

Changes were in the following variables: adm_date_corrected, disch_date_rec5_corrected and tr_compliance_rec.

Code
multiple_overlaps_manual_correction<-
rio::import(paste0(wdpath, "cons/_out/more_one_overlaps_after_center_id_mod.xlsx"), sheet = "Hoja 1")

cat("Aggregated by rows so we can pair the previous and the updated manual correciton by rownumbers")
result_multiple_overlaps_manual_correction <- aggregate(rn ~ hash_key, data = multiple_overlaps_manual_correction, FUN = function(x) paste(x, collapse = ","))

cat("How many records were in the Excel file vs. this new one?\n")
if(more_one_overlap_after_center_id|>
  mutate(hash_key=as.numeric(hash_key))|> 
  filter(!rn %in% multiple_overlaps_manual_correction$rn) |> nrow()>0){
  stop("There are records in the new Excel file that are not in the old one")
} else {print(0)}

#comparison by row numbers combined contrasted
if(
  result_more_one_overlap_after_center_id|>
    left_join(result_multiple_overlaps_manual_correction, by="rn")|> 
  (\(df) {
    colnames(df)<- c("actual_hash_key", "rownumbers_combined", "previous_review_hashkey")
    df
  })() |> 
    filter(is.na(actual_hash_key))|> nrow()>0){
  stop("There are records in the new Excel file that are not in the old one")
}else {print(0)}


cat("I need to find hash_keys that share the same rownumbers")
cat("This means that i only need to update the combiniton of rownumbers that are needed only")

result_more_one_overlap_after_center_id_updated <- 
result_more_one_overlap_after_center_id|>
   left_join(result_multiple_overlaps_manual_correction, by="rn")|>
   (\(df) {
       colnames(df)<- c("actual_hash_key", "rownumbers_combined", "previous_review_hashkey")
       df
   })()|> 
  left_join(multiple_overlaps_manual_correction, by= c("previous_review_hashkey"="hash_key"))|> 
   (\(df) {
     if(nrow(df)!= more_one_overlap_after_center_id|> nrow()){
       stop("Different rows between the updated and the matched with the manual correction")}
     df
   })()|> 
  #obtain real encripted RUNs
  left_join((mutate(more_one_overlap_after_center_id, hash_key_num=as.numeric(hash_key))[, c("hash_key", "hash_key_num")]|> distinct(hash_key, .keep_all=T)), by= c("actual_hash_key"="hash_key_num"))|>
   (\(df) {  
     #check if there are updates in the criteria and information to judge overlappings
  if(group_by(df, previous_review_hashkey)|> filter(any(rn %in% hashs_dates_updated_disch_date$rny))|> ungroup()|> nrow()>0){ warning("Updated discharge dates of 2019 are still being discussed")}
  if(group_by(df, previous_review_hashkey)|> filter(any(rn %in% rows_truncated_treatments_due_to_retrieval_2019))|> ungroup()|> nrow()>0){ warning("Missing discharge dates of pre-0 are still being discussed")}
     df
   })()

Warning in (function(df) {: Missing discharge dates of pre-0 are still being discussed

Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
result_more_one_overlap_after_center_id_updated|>
  filter(OBS == "eliminar")|>
  (\(df) {
    cat(paste0("4.0c.1.Delete tr. episodes, multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4.0c.1.Delete tr. episodes, multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df|> 
      pull(rn) ->> row_40c_delete_tr_episodes
    df|> 
      left_join(more_one_overlap_after_center_id[,c("rn","hash_key")], by = "rn", multiple = "first")|>
      pull(hash_key.y)|> as.character() ->> hashes_40c_delete_tr_episodes
  })()
# 4.0c.1.Delete tr. episodes, multiple overlappings, cases: 35 
# 4.0c.1.Delete tr. episodes, multiple overlappings, RUNs: 28 

#june 2025
# 4.0c.1.Delete tr. episodes, multiple overlappings, cases: 31 
# 4.0c.1.Delete tr. episodes, multiple overlappings, RUNs: 24

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
result_more_one_overlap_after_center_id_updated|>
  filter(!is.na(disch_date_rec5_corrected))|>
  (\(df) {
    cat(paste0("4.0c.2.Replace discharge dates, multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4.0c.2.Replace discharge dates, multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df|> 
      select(rn, disch_date_rec5_corrected)|> 
      mutate(disch_date_rec5_corrected= as.Date(as.character(disch_date_rec5_corrected))) ->> row_40c_replace_disch_dates
  })()
# 4.0c.2.Replace discharge dates, multiple overlappings, cases: 62 
# 4.0c.2.Replace discharge dates, multiple overlappings, RUNs: 44 

#june 2025
# 4.0c.2.Replace discharge dates, multiple overlappings, cases: 56 
# 4.0c.2.Replace discharge dates, multiple overlappings, RUNs: 38 

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

result_more_one_overlap_after_center_id_updated|>
  filter(!is.na(adm_date_corrected))|>
  (\(df) {
    cat(paste0("4.0c.3.Replace admission date, multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4.0c.3.Replace admission date, multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df|> 
      select(rn, adm_date_corrected)|> 
      mutate(adm_date_corrected= as.Date(as.character(adm_date_corrected))) ->> row_40c_replace_adm_dates
  })()
# 4.0c.3.Replace admission date, multiple overlappings, cases: 42 
# 4.0c.3.Replace admission date, multiple overlappings, RUNs: 33 

#june2025
# 4.0c.3.Replace admission date, multiple overlappings, cases: 42 
# 4.0c.3.Replace admission date, multiple overlappings, RUNs: 33 

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

result_more_one_overlap_after_center_id_updated|>
  filter(!is.na(tr_compliance_rec))|>
  (\(df) {
    cat(paste0("4.0c.4.Replace referral cause, multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4.0c.4.Replace referral cause, multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df|> 
      select(rn, tr_compliance_rec)|> 
      mutate(tr_compliance_rec= as.character(tr_compliance_rec)) ->> row_40c_replace_referral
  })()
# 4.0c.4.Replace referral cause, multiple overlappings, cases: 5 
# 4.0c.4.Replace referral cause, multiple overlappings, RUNs: 5 

# june2025
# 4.0c.4.Replace referral cause, multiple overlappings, cases: 2 
# 4.0c.4.Replace referral cause, multiple overlappings, RUNs: 2 

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("(________________________________________________)")
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

SISTRAT23_c1_2010_2022_df_prev1m<-
  SISTRAT23_c1_2010_2022_df_prev1l|>
      (\(df) {
    cat(paste0("4.0c. Database before apply rules based on multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4.0c. Database before apply rules based on multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })()|> 
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  filter(!(rn %in% row_40c_delete_tr_episodes))|>
  mutate(OBS= case_when(hash_key %in% hashes_40c_delete_tr_episodes~ paste0(as.character(OBS),";","4.0c.1.Multiple overlappings, discarded tr. episodes"), T~ OBS))|> 
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
  left_join(row_40c_replace_adm_dates, by="rn", suffix=c("","_40c3"))|> 
  left_join(row_40c_replace_disch_dates, by="rn", suffix=c("","_40c2"))|> 
  left_join(row_40c_replace_referral, by="rn", suffix=c("","_40c4"))|> 
  mutate(OBS= case_when(!is.na(adm_date_corrected)~ paste0(as.character(OBS),";","4.0c.3.Multiple overlappings, replace admission dates"), T~ OBS))|> 
  mutate(OBS= case_when(!is.na(disch_date_rec5_corrected)~ paste0(as.character(OBS),";","4.0c.2.Multiple overlappings, replace discharge dates"), T~ OBS))|> 
  mutate(OBS= case_when(!is.na(tr_compliance_rec_40c4)~ paste0(as.character(OBS),";","4.0c.4.Multiple overlappings, replace cause of discharge as referral"), T~ OBS))|> 
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
  mutate(adm_date_rec2= case_when(!is.na(adm_date_corrected)~ adm_date_corrected, T~ adm_date_rec))|>
  mutate(adm_date_num_rec2= case_when(!is.na(adm_date_corrected)~ as.numeric(adm_date_rec2), T~ as.numeric(adm_date_rec_num)))|>   
  mutate(disch_date_num_rec6= case_when(!is.na(disch_date_rec5_corrected)~ as.numeric(disch_date_rec5_corrected), T~ disch_date_num_rec5))|> 
  mutate(disch_date_rec6= case_when(!is.na(disch_date_rec5_corrected)~ disch_date_rec5_corrected, T~ disch_date_rec5))|> 
  mutate(tr_compliance_rec2= case_when(!is.na(tr_compliance_rec_40c4)~ tr_compliance_rec_40c4, T~ tr_compliance_rec))|> 
  mutate(adm_age_rec2=round(as.numeric((adm_date_rec2-birth_date_rec))/365.25,2))|>
  #discard any columns related to the join to the main database
  select(-(any_of(c("adm_date_corrected","disch_date_rec5_corrected"))))|> 
  select(-(contains("_40c4")))|> 
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
  mutate(dit_rec6= disch_date_num_rec5- adm_date_num_rec2)|>   
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #Early vs. late dropout (2025-06-03, checked right)
  mutate(dit_earl_drop_rec= ifelse(dit_rec6<90 & !is.na(dit_rec6),1,0))|>
  mutate(dit_earl_drop= factor(dit_earl_drop_rec, labels=c(">= 90 days", "<90 days")))|> #t.test(dit_rec6~ dit_earl_drop_rec, data= SISTRAT23_c1_2010_2022_df_prev1m)
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #Treatment compliance
  mutate(tr_compliance_rec3= case_when(grepl("<",dit_earl_drop) & grepl("drop", tr_compliance_rec2)~ "early dropout", grepl(">",dit_earl_drop) & grepl("drop", tr_compliance_rec2)~ "late dropout", grepl("<",dit_earl_drop) & grepl("adm dis", tr_compliance_rec2)~ "early adm discharge", grepl(">",dit_earl_drop) & grepl("adm dis", tr_compliance_rec2)~ "late adm discharge", grepl("completion",tr_compliance_rec2)~ "completion", grepl("death",tr_compliance_rec2)~ "death", grepl("referral",tr_compliance_rec2)~ "referral", grepl("adm tr",tr_compliance_rec2)~ "adm truncated", is.na(tr_compliance_rec2)~ "adm truncated", TRUE~ "currently in"))|>
  #table(SISTRAT23_c1_2010_2022_df_prev1m$dit_earl_drop_rec,SISTRAT23_c1_2010_2022_df_prev1m$dit_earl_drop))
  #filter(is.na(tr_compliance_rec2)) |> glimpse()
  #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
        (\(df) {
    cat(paste0("4.0c. Database after apply rules based on multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4.0c. Database after apply rules based on multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1l))stop("Error: Added treatment episodes in the process")
    df
  })()
# 4.0c. Database before apply rules based on multiple overlappings, cases: 150,076 
# 4.0c. Database before apply rules based on multiple overlappings, RUNs: 106,283 
# 4.0c. Database after apply rules based on multiple overlappings, cases: 150,041 
# 4.0c. Database after apply rules based on multiple overlappings, RUNs: 106,283 
Aggregated by rows so we can pair the previous and the updated manual correciton by rownumbersHow many records were in the Excel file vs. this new one?
[1] 0
[1] 0
I need to find hash_keys that share the same rownumbersThis means that i only need to update the combiniton of rownumbers that are needed only4.0c.1.Delete tr. episodes, multiple overlappings, cases: 31 
4.0c.1.Delete tr. episodes, multiple overlappings, RUNs: 24 
4.0c.2.Replace discharge dates, multiple overlappings, cases: 56 
4.0c.2.Replace discharge dates, multiple overlappings, RUNs: 38 
4.0c.3.Replace admission date, multiple overlappings, cases: 42 
4.0c.3.Replace admission date, multiple overlappings, RUNs: 33 
4.0c.4.Replace referral cause, multiple overlappings, cases: 2 
4.0c.4.Replace referral cause, multiple overlappings, RUNs: 2 
4.0c. Database before apply rules based on multiple overlappings, cases: 150,077 
4.0c. Database before apply rules based on multiple overlappings, RUNs: 106,283 
4.0c. Database after apply rules based on multiple overlappings, cases: 150,046 
4.0c. Database after apply rules based on multiple overlappings, RUNs: 106,283 

The database SISTRAT23_c1_2010_2022_df_prev1m was generated by replacing the original admission and discharge dates, along with the causes of discharge. Subsequently, the following variables were added to this dataset: the revised discharge date (disch_date_rec6), its numeric representation (disch_date_num_rec6), and the calculated days in treatment (dit_rec6). The revised admission date resulting from the replacement was also included in the final dataset in its numeric (adm_date_num_rec2) and date (adm_date_rec2) format. Also, we generated tr_compliance_rec3 to recode cause of discharge according to changes made in days in treatment and overlapping correction. We also added dit_earl_drop_rec, a binary classification of treatments with less than 90 days.

We check again if there are overlaps after manual replacements.

Code
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps<- 
  SISTRAT23_c1_2010_2022_df_prev1m|>
    mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec6), 19475, disch_date_num_rec6))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
    rename("hash_key_2"="hash_key", "rn2"="rn")|>
    select(rn2, hash_key_2, TABLE, adm_age_rec2, adm_date_rec2, adm_date_num_rec2 , disch_date_rec6, disch_date_num_miss, dit_rec6, id_centro, tr_compliance_rec3, plan_type, senda)|> 
    #dplyr::filter(motivodeegreso!="Derivación")|>
    data.table::as.data.table()
  
overlap_dates_C1_after_miss_less30d_0d_center_id_mult_overlap <- janitor::clean_names(
    sqldf::sqldf(
      "
      SELECT *
      FROM CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps AS x
      INNER JOIN CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps AS y 
      ON x.hash_key_2 = y.hash_key_2 
         AND x.rn2 < y.rn2  -- Avoids duplicates (eg.: x vs y and then y vs x)
         AND x.adm_date_num_rec2 < y.disch_date_num_miss  -- x Admitted before being admitted into another treatment
         AND x.disch_date_num_miss > y.adm_date_num_rec2  -- x Discharged after being admitted in other
         "
    ))|>
    `colnames<-`(c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1",  "rn_2", "hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2")) 
  
    cat(paste0("Number of overlapped dates, observations: ", nrow(overlap_dates_C1_after_miss_less30d_0d_center_id_mult_overlap)),"\n")
    cat(paste0("Number of overlapped dates, RUNs: ", nrow(distinct(overlap_dates_C1_after_miss_less30d_0d_center_id_mult_overlap, hash_key_1))))
    #Number of overlapped dates, observations: 90 
    #Number of overlapped dates, RUNs: 89
# june 2025
    #Number of overlapped dates, observations: 89 
    #Number of overlapped dates, RUNs: 88

#The rows on the left originate from older databases.
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps <- 
as_tidytable(overlap_dates_C1_after_miss_less30d_0d_center_id_mult_overlap)|>
  mutate(pair_id= paste0(rn_1,"_",rn_2))|> 
  mutate(same_id=ifelse(id_centro_1==id_centro_2,1,0))|>
  mutate(bd_2_earlier=ifelse(ano_bd_2>ano_bd_1,1,0))|> #es el dato de la derecha de una base de datos mas reciente.
  mutate(senda_status= case_when(senda_1=="si" & senda_2=="si"~ "both yes", senda_1=="no" & senda_2=="no"~ "both no", senda_1=="no" & senda_2=="si"~ "second yes", senda_1=="no" & senda_2=="no"~ "second no",  T~NA_character_))|>
  mutate(referral= ifelse(tr_compliance_1=="referral",1,0))|>
  mutate(days_overlapped=disch_date_num_1-adm_date_rec_num_2)|> # para que hayan dias positivos. Se supone que la fecha de egreso es más reciente que la fecha de ingreso del evento que superpone.
  mutate(more_dit=ifelse(dit_2>dit_1,1,0))|> #más días tratado en 2
  mutate(trat_1_within_2=ifelse(disch_date_num_1<disch_date_num_2 & adm_date_rec_num_1>adm_date_rec_num_2,1,0))|>
  mutate(trat_2_within_1=ifelse(disch_date_num_2<disch_date_num_1 & adm_date_rec_num_2>adm_date_rec_num_1,1,0))|>
  select(-hash_key_2) |> 
  rename("hash_key"="hash_key_1")

warning("2025-04-09: The conditions now should be that the row number is present in the Excel file and also in the rows vector where more than one overlap was detected. Otherwise, outdated cases will be corrected, which, due to the correction of the truncation date in the 2019 database, are no longer valid as overlaps.")

Warning: 2025-04-09: The conditions now should be that the row number is present in the Excel file and also in the rows vector where more than one overlap was detected. Otherwise, outdated cases will be corrected, which, due to the correction of the truncation date in the 2019 database, are no longer valid as overlaps.

Code
warning("2025-06-02: This was corrected partially, as 2019 updated dates were used.")

Warning: 2025-06-02: This was corrected partially, as 2019 updated dates were used.

Number of overlapped dates, observations: 89 
Number of overlapped dates, RUNs: 88
Code
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps|> 
  left_join(SISTRAT23_c1_2010_2022_df_prev1m[, c("rn", "OBS")], by=c("rn_1"="rn"))|> 
left_join(SISTRAT23_c1_2010_2022_df_prev1m[, c("rn", "OBS")], by=c("rn_2"="rn"), suffix=c("","_2nd"))|> 
  (\(df) {
        mutate(df, hash_key= as.numeric(factor(hash_key)))|> rio::export("_out/_overlaps_dup_after_manual_imp.xlsx") #for visual comparison in excel
        knitr::kable(filter(df, hash_key %in% pull(sample_n_with_seed(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps,20, seed=2125),"hash_key"))|> mutate(hash_key= as.numeric(factor(hash_key))), format = "html", format.args = list(decimal.mark = ".", big.mark = ","), caption="Cases with overlapped treatment ranges (after correcting for missing discharge dates)", align = rep('c', 32))|> 
  kableExtra::kable_classic()|> 
  kableExtra::scroll_box(height = "400px")
    })()
Cases with overlapped treatment ranges (after correcting for missing discharge dates)
rn_1 hash_key ano_bd_1 adm_age_1 adm_date_1 adm_date_rec_num_1 disch_date_1 disch_date_num_1 dit_1 id_centro_1 tr_compliance_1 plan_type_1 senda_1 rn_2 ano_bd_2 adm_age_2 adm_date_2 adm_date_rec_num_2 disch_date_2 disch_date_num_2 dit_2 id_centro_2 tr_compliance_2 plan_type_2 senda_2 pair_id same_id bd_2_earlier senda_status referral days_overlapped more_dit trat_1_within_2 trat_2_within_1 OBS OBS_2nd
19,764 1 2011 26.30 2011-11-14 15,292 2011-12-23 15,331 39 182 early dropout pg-pai si 37,150 2013 25.23 2010-10-19 14,901 2013-09-13 15,961 1,060 408 referral pg-pab si 19764_37150 0 1 both yes 0 430 1 1 0 2.1.1.b.The most common date is selected as the birth date;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards 2.1.1.b.The most common date is selected as the birth date;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards
47,306 2 2013 48.45 2013-12-17 16,056 2014-01-29 16,099 43 328 referral pg-pab si 53,893 2014 47.53 2013-01-17 15,722 2014-08-27 16,309 587 502 completion pg-pai si 47306_53893 0 1 both yes 1 377 1 1 0 ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards
45,670 3 2013 31.29 2013-10-21 15,999 2014-01-29 16,099 100 123 referral pg-pai si 53,680 2014 31.16 2013-09-02 15,950 2014-05-06 16,196 246 502 late dropout pg-pai si 45670_53680 0 1 both yes 1 149 1 1 0 ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards
35,463 4 2013 29.22 2013-01-14 15,719 2013-02-04 15,740 21 258 early dropout pg-pr si 36,422 2013 27.65 2011-06-21 15,146 2013-05-31 15,856 710 238 referral pg-pab no 35463_36422 0 0 0 594 1 1 0
204,449 5 2021 34.63 2021-11-30 18,961 2022-02-01 19,024 63 238 early dropout pg-pai no 210,941 2022 34.64 2021-12-07 18,968 2022-02-01 19,024 56 258 early dropout pg-pr si 204449_210941 0 1 second yes 0 56 0 0 0 1.1. Duplicated Cases in Almost Every Variable
5,572 6 2010 22.10 2010-01-25 14,634 2011-06-02 15,127 493 109 referral pg-pab si 6,177 2010 22.45 2010-06-01 14,761 2010-12-15 14,958 197 117 late dropout pg-pr no 5572_6177 0 0 1 366 0 0 1 1c.b.3.cases w/different discharge dates, removed entries w/ lower dit; 1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs; ;2.1.1.c.Multiple common dates found. Select the birth date closest to available external records 2.1.1.c.Multiple common dates found. Select the birth date closest to available external records
5,014 7 2010 29.65 2009-04-27 14,361 2011-06-06 15,131 770 109 referral pg-pab si 10,410 2011 30.61 2010-04-12 14,711 2012-01-31 15,370 659 117 completion pg-pr no 5014_10410 0 1 1 420 0 0 0 1.1. Duplicated Cases in Almost Every Variable
336 8 2010 21.88 2007-07-17 13,711 2010-02-01 14,641 945 118 referral pg-pai si 12,792 2011 24.35 2010-01-04 14,613 2012-02-01 15,371 758 118 late adm discharge pg-pai si 336_12792 1 1 both yes 1 28 0 0 0 ;4.0c.2.Multiple overlappings, replace discharge dates
73,917 9 2015 20.94 2015-03-01 16,495 2015-07-27 16,643 148 148 late dropout pg-pai si 80,992 2015 21.20 2015-06-06 16,592 2016-02-01 16,832 240 146 late dropout pg-pr no 73917_80992 0 0 0 51 1 0 0 3.1. Collapsed Treatment Plans
72,706 10 2015 41.28 2015-02-05 16,471 2015-10-02 16,710 239 316 referral pg-pai si 78,199 2015 41.67 2015-06-25 16,611 2015-06-27 16,613 2 336 completion pg-pr no 72706_78199 0 0 1 99 0 0 1 3.1. Collapsed Treatment Plans
75,938 11 2015 40.73 2015-04-30 16,555 2015-06-10 16,596 41 609 referral pg-pab si 76,789 2015 40.71 2015-04-21 16,546 2015-06-30 16,616 70 146 referral pg-pr no 75938_76789 0 0 1 50 1 1 0 3.1. Collapsed Treatment Plans
103,198 12 2016 38.98 2016-10-11 17,085 2016-12-02 17,137 52 141 referral pg-pai no 109,664 2017 39.00 2016-10-20 17,094 2017-07-13 17,360 266 142 completion m-pr si 103198_109664 0 1 second yes 1 43 1 0 0 1.1. Duplicated Cases in Almost Every Variable
34,549 13 2013 20.94 2012-11-12 15,656 2013-02-04 15,740 84 142 early dropout m-pr si 36,225 2013 20.71 2012-08-18 15,570 2013-10-10 15,988 418 146 late dropout pg-pab no 34549_36225 0 0 0 170 1 1 0 1.1. Duplicated Cases in Almost Every Variable;2.1.1.b.The most common date is selected as the birth date 2.1.1.b.The most common date is selected as the birth date
6,300 14 2010 30.27 2010-05-27 14,756 2011-06-06 15,131 375 109 referral pg-pai si 11,408 2011 30.55 2010-09-08 14,860 2011-08-31 15,217 357 117 late dropout pg-pr no 6300_11408 0 1 1 271 0 0 0 1.1. Duplicated Cases in Almost Every Variable
18,837 15 2011 31.76 2011-09-01 15,218 2011-09-07 15,224 6 275 early dropout m-pr no 21,776 2012 31.61 2011-07-07 15,162 2012-12-26 15,700 538 291 referral pg-pai si 18837_21776 0 1 second yes 0 62 1 1 0 1.1. Duplicated Cases in Almost Every Variable
45,033 16 2013 56.21 2013-09-11 15,959 2014-01-29 16,099 140 123 referral pg-pai si 54,497 2014 56.17 2013-08-26 15,943 2014-08-20 16,302 359 502 completion pg-pai si 45033_54497 0 1 both yes 1 156 1 1 0 2.1.1.b.The most common date is selected as the birth date;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards 2.1.1.b.The most common date is selected as the birth date;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards
35,519 17 2013 27.99 2013-01-04 15,709 2013-05-28 15,853 144 154 referral pg-pr si 36,433 2013 27.99 2013-01-01 15,706 2013-08-27 15,944 238 330 referral pg-pr no 35519_36433 0 0 1 147 1 1 0 3.1. Collapsed Treatment Plans
44,506 18 2013 22.26 2013-09-06 15,954 2013-11-29 16,038 84 123 early adm discharge pg-pab si 53,864 2014 22.18 2013-08-08 15,925 2014-05-06 16,196 271 502 late dropout pg-pab si 44506_53864 0 1 both yes 0 113 1 1 0 ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards
5,613 19 2010 23.87 2010-05-06 14,735 2010-08-26 14,847 112 275 referral m-pr si 23,153 2012 23.83 2010-04-19 14,718 2012-10-31 15,644 926 259 late dropout pg-pai si 5613_23153 0 1 both yes 1 129 1 1 0 ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards 1.1. Duplicated Cases in Almost Every Variable;1c.a.3.cases w/different discharge dates, removed entries w/ lower dit; 1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs; ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards
25,303 20 2012 18.79 2012-03-19 15,418 2019-12-31 18,261 2,843 adm truncated pg-pai no 29,037 2012 19.21 2012-08-20 15,572 2013-01-15 15,720 148 referral pg-pr no 25303_29037 0 both no 0 2,689 0 0 1 2.1.1.a.Less16|More90, removed rows due to >2 |diff| ;2.1.1.c.Multiple common dates found. Select the birth date closest to available external records; 4.pre. Missing discharge dates due administrative truncation in 2019, imputed 2.1.1.a.Less16|More90, removed rows due to >2 |diff| ;2.1.1.c.Multiple common dates found. Select the birth date closest to available external records;3.1. Collapsed Treatment Plans;3.4. Invalid Age Of Onset of Substance use, <5 yrs old

Provisionally, we generated the database SISTRAT23_c1_2010_2022_df_prev1n, which removes duplicates which were of more than 1095 days, within another treatment, if there are financed by SENDA and other not financed by SENDA, kept financed by SENDA, in order to continue reviewing the observations and normalizing data in other relevant aspects.

Code
SISTRAT23_c1_2010_2022_df_prev1n<-
  SISTRAT23_c1_2010_2022_df_prev1m |> 
  #greater than 1095 days
  dplyr::filter(!(rn %in% subset(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps, dit_1>1095, "rn_1")))|> 
  dplyr::filter(!(rn %in% subset(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps, dit_2>1095, "rn_2")))|>   
  #tr. within the other
  dplyr::filter(!(rn %in% subset(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps, trat_1_within_2==1, "rn_1")))|> 
  dplyr::filter(!(rn %in% subset(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps, trat_2_within_1==1, "rn_2")))|>
  #senda Yes
  dplyr::filter(!(rn %in% subset(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps, senda_1=="no" & senda_2=="si", "rn_1")))|>                  
 dplyr::filter(!(rn %in% subset(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps, senda_1=="si" & senda_2=="no", "rn_2"))) |> 
    #;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
        (\(df) {
    cat(paste0("4.0xx. Database after eliminating remanent duplicates, cases: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("4.0xx. Database after eliminating remanent duplicates, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1m))stop("Error: Added treatment episodes in the process")
    df
  })()
4.0xx. Database after eliminating remanent duplicates, cases: 150,046 
4.0xx. Database after eliminating remanent duplicates, RUNs: 106,283 

1. Data Editing / Deductive Imputation

1.1. DSM/ICD-10

Some cases did not have a primary diagnosis in DSM-IV notation but have a secondary (n= 604) or tertiary but no secondary (n= 20).

The data uses a nested structure for main and sub-diagnostic categories. When an episode had a main diagnosis but was missing a sub-diagnosis, we inserted ‘NA’ as a placeholder (“NA_placeholder_”). We then removed duplicate entries among the second or third pair of main and sub-diagnoses. After this cleaning step, the diagnoses for each episode were concatenated

The replacement of DSM-IV diagnoses with ICD-10 codes is not recommended for our analysis yet. We lack documentation on the source of any intersection between these classification systems, and 31 sub-diagnoses have no direct equivalents between the two systems. This inconsistency would compromise the validity of our diagnostic categorization and subsequent analyses..

The main diagnoses and sub-diagnoses for ICD-10 and DSM-IV classification systems were combined into the mod_psiq_cie_10_or and mod_psiq_dsm_iv_or columns, respectively. In the future (step 4), they should be separated by column.

Additionally, the columns with suffixes _instudy (detects any “in study”), _no_dg (detects any “no disorder”), and _dg (detects any valid diagnostic) enable the identification of records where categories such as “sin trastorno” (no disorder) and “en estudio” (under study) can be removed, as these designations provide no clinical value when they occur alongside established diagnoses _dg).

Code
names_dg_dsmiv<-
c("diagnostico_trs_psiquiatrico_dsm_iv", "diagnostico_trs_psiquiatrico_sub_dsm_iv", 
"x2_diagnostico_trs_psiquiatrico_dsm_iv", "x2_diagnostico_trs_psiquiatrico_sub_dsm_iv", 
"x3_diagnostico_trs_psiquiatrico_dsm_iv", "x3_diagnostico_trs_psiquiatrico_sub_dsm_iv")
names_dg_icd10<- c("diagnostico_trs_psiquiatrico_cie_10", "diagnostico_trs_psiquiatrico_sub_cie_10", 
"x2_diagnostico_trs_psiquiatrico_cie_10", "x2_diagnostico_trs_psiquiatrico_sub_cie_10", 
"x3_diagnostico_trs_psiquiatrico_cie_10", "x3_diagnostico_trs_psiquiatrico_sub_cie_10", 
"diagnostico_trastorno_psiquiatrico_cie_10_al_egreso")

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #dg_trs_psiq_sub_cie_10_or x2_dg_trs_psiq_sub_cie_10_or x3_dg_trs_psiq_sub_cie_10_or
cat("Cases with sub-diagnostics but without the main: DSM-IV\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()
cat("second dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(x2_diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()
cat("third dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(x3_diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Cases with sub-diagnostics but without the main: ICD-10\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(diagnostico_trs_psiquiatrico_cie_10) & !is.na(diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10)))|> nrow()
cat("second dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(x2_diagnostico_trs_psiquiatrico_cie_10) & !is.na(x2_diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10)))|> nrow()
cat("third dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(x3_diagnostico_trs_psiquiatrico_cie_10) & !is.na(x3_diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10)))|> nrow()
# 3 cases with sub-diagnostics
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #dg_trs_psiq_sub_cie_10_or x2_dg_trs_psiq_sub_cie_10_or x3_dg_trs_psiq_sub_cie_10_or
cat("Cases with sub-diagnostics but without the main: DSM-IV\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()
cat("second dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(x2_diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()
cat("third dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(x3_diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno): ICD-10\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter((is.na(diagnostico_trs_psiquiatrico_cie_10) |
    diagnostico_trs_psiquiatrico_cie_10 == "en estudio" |
    diagnostico_trs_psiquiatrico_cie_10 == "sin trastorno") & !is.na(diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10))) |> 
  nrow()
cat("second dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter((is.na(x2_diagnostico_trs_psiquiatrico_cie_10) |
    x2_diagnostico_trs_psiquiatrico_cie_10 == "en estudio" |
    x2_diagnostico_trs_psiquiatrico_cie_10 == "sin trastorno") & !is.na(x2_diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10))) |> 
  nrow()
cat("third dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter((is.na(x3_diagnostico_trs_psiquiatrico_cie_10) |
    x3_diagnostico_trs_psiquiatrico_cie_10 == "en estudio" |
    x3_diagnostico_trs_psiquiatrico_cie_10 == "sin trastorno") & !is.na(x3_diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10))) |> 
  nrow()

cat("Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno): DSM-IV\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter((is.na(diagnostico_trs_psiquiatrico_dsm_iv) |
    diagnostico_trs_psiquiatrico_dsm_iv == "en estudio" |
    diagnostico_trs_psiquiatrico_dsm_iv == "sin trastorno") & !is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv))) |> 
  nrow()
cat("second dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter((is.na(x2_diagnostico_trs_psiquiatrico_dsm_iv) |
    x2_diagnostico_trs_psiquiatrico_dsm_iv == "en estudio" |
    x2_diagnostico_trs_psiquiatrico_dsm_iv == "sin trastorno") & !is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv))) |> 
  nrow()
cat("third dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter((is.na(x3_diagnostico_trs_psiquiatrico_dsm_iv) |
    x3_diagnostico_trs_psiquiatrico_dsm_iv == "en estudio" |
    x3_diagnostico_trs_psiquiatrico_dsm_iv == "sin trastorno") & !is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv))) |> 
  nrow() #4
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

invisible("Primero solucionar el problema de arriba: clasificaciones con en estudio, pero la subclasificación con diagnóstico (agregarle como condición que la subclas tenga también no_NAs")

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("to standardize the main category with the DSM-IV subcategory\n")
dg_trs_psiq_dsm_iv_sub_tab<-
  SISTRAT23_c1_2010_2022_df_prev1n|>
  mutate(dsm_concat= paste0(diagnostico_trs_psiquiatrico_dsm_iv, "_", diagnostico_trs_psiquiatrico_sub_dsm_iv))|> 
  mutate(x2_dsm_concat= paste0(x2_diagnostico_trs_psiquiatrico_dsm_iv, "_", x2_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> 
  mutate(x3_dsm_concat= paste0(x3_diagnostico_trs_psiquiatrico_dsm_iv, "_", x3_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> 
  select(ends_with("dsm_concat")) |> 
  pivot_longer(
    cols = everything(),
    names_to = "concat_type",
    values_to = "concat_value"
  )|>
  select(-concat_type)|> 
  janitor::tabyl(concat_value)|> 
  data.frame()|>
  arrange(desc(n))|> 
  select(concat_value)|> 
  #only useful links
  filter(!grepl("_NA$", concat_value))|>
  filter(!grepl("^NA_", concat_value))|>
  filter(!grepl("en estudio", concat_value))|>
  filter(!grepl("sin trastorno", concat_value))|> 
  tidyr::separate(concat_value, into = c("main", "sub"), sep = "_")

if(dg_trs_psiq_dsm_iv_sub_tab |> 
         group_by(sub) |>
         summarise(main=n_distinct(main)) |> 
         filter(main>1) |> nrow()<0){stop("more than one main category in one sub diagnostic")}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("to standardize the main category with the DSM-IV subcategory\n")
dg_trs_psiq_icd10_sub_tab<-
  SISTRAT23_c1_2010_2022_df_prev1n|>
  mutate(icd_concat= paste0(diagnostico_trs_psiquiatrico_cie_10, "_", diagnostico_trs_psiquiatrico_sub_cie_10))|> 
  mutate(x2_icd_concat= paste0(x2_diagnostico_trs_psiquiatrico_cie_10, "_", x2_diagnostico_trs_psiquiatrico_sub_cie_10))|> 
  mutate(x3_icd_concat= paste0(x3_diagnostico_trs_psiquiatrico_cie_10, "_", x3_diagnostico_trs_psiquiatrico_sub_cie_10))|> 
  select(ends_with("icd_concat")) |> 
  pivot_longer(
    cols = everything(),
    names_to = "concat_type",
    values_to = "concat_value"
  )|>
  select(-concat_type)|> 
  janitor::tabyl(concat_value)|> 
  data.frame()|>
  arrange(desc(n))|> 
  select(concat_value)|> 
  #only useful links
  filter(!grepl("_NA$", concat_value))|>
  filter(!grepl("^NA_", concat_value))|>
  filter(!grepl("en estudio", concat_value))|>
  filter(!grepl("sin trastorno", concat_value))|> 
  tidyr::separate(concat_value, into = c("main", "sub"), sep = "_")

if(dg_trs_psiq_icd10_sub_tab |> 
         group_by(sub) |>
         summarise(main=n_distinct(main)) |> 
         filter(main>1) |> nrow()<0){stop("more than one main category in one sub diagnostic")}

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("remove redundancies and duplicates in diagnoses\n")
#_______________#_____________________#______________#
#_______________#_____________________#______________#
#_______________#_____________________#______________#

invisible("Put placeholder to make replacements")
SISTRAT23_c1_2010_2022_df_prev1n_mod1<-
SISTRAT23_c1_2010_2022_df_prev1n|> 
    mutate(dg_trs_psiq_dsm_iv_or = case_when(
    !is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv) &
      diagnostico_trs_psiquiatrico_sub_dsm_iv != "en estudio" &
      diagnostico_trs_psiquiatrico_sub_dsm_iv != "sin trastorno" ~
      # Perform the lookup here
      dg_trs_psiq_dsm_iv_sub_tab$main[match(diagnostico_trs_psiquiatrico_sub_dsm_iv, dg_trs_psiq_dsm_iv_sub_tab$sub)],
    TRUE ~ diagnostico_trs_psiquiatrico_dsm_iv # Keep the original value otherwise
    #to explore differences and origin
  ))|> #filter(is.na(dg_trs_psiq_dsm_iv_or), !is.na(diagnostico_trs_psiquiatrico_dsm_iv)) |> select(c("hash_key","dg_trs_psiq_dsm_iv_or","dg_trs_psiq_sub_dsm_iv_or", any_of(names_dg_dsmiv)))|> View()
  mutate(x2_dg_trs_psiq_dsm_iv_or = case_when(
    !is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv) &
      x2_diagnostico_trs_psiquiatrico_sub_dsm_iv != "en estudio" &
      x2_diagnostico_trs_psiquiatrico_sub_dsm_iv != "sin trastorno" &
      x2_diagnostico_trs_psiquiatrico_sub_dsm_iv != "NA_placeholder" ~
      dg_trs_psiq_dsm_iv_sub_tab$main[match(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv, dg_trs_psiq_dsm_iv_sub_tab$sub)],
    TRUE ~ x2_diagnostico_trs_psiquiatrico_dsm_iv 
  ))|> #filter(x2_dg_trs_psiq_dsm_iv_or!=x2_diagnostico_trs_psiquiatrico_dsm_iv) |> select(c("hash_key","x2_dg_trs_psiq_dsm_iv_or", any_of(names_dg_dsmiv)))|> glimpse()
  mutate(x3_dg_trs_psiq_dsm_iv_or = case_when(
    !is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv) &
      x3_diagnostico_trs_psiquiatrico_sub_dsm_iv != "en estudio" &
      x3_diagnostico_trs_psiquiatrico_sub_dsm_iv != "sin trastorno" ~
      dg_trs_psiq_dsm_iv_sub_tab$main[match(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv, dg_trs_psiq_dsm_iv_sub_tab$sub)],
    TRUE ~ x3_diagnostico_trs_psiquiatrico_dsm_iv 
  ))|> #filter(x3_dg_trs_psiq_dsm_iv_or!=x3_diagnostico_trs_psiquiatrico_dsm_iv) |> select(c("hash_key","x3_dg_trs_psiq_dsm_iv_or", any_of(names_dg_dsmiv)))|> glimpse()
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #now with ICD-10 classifications
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  mutate(dg_trs_psiq_cie_10_or = case_when(
    !is.na(diagnostico_trs_psiquiatrico_sub_cie_10) &
      diagnostico_trs_psiquiatrico_sub_cie_10 != "en estudio" &
      diagnostico_trs_psiquiatrico_sub_cie_10 != "sin trastorno" ~
      dg_trs_psiq_icd10_sub_tab$main[match(diagnostico_trs_psiquiatrico_sub_cie_10, dg_trs_psiq_icd10_sub_tab$sub)],
    TRUE ~ diagnostico_trs_psiquiatrico_cie_10 
  ))|> #filter(dg_trs_psiq_cie_10_or!=diagnostico_trs_psiquiatrico_cie_10) |> select(c("hash_key","dg_trs_psiq_cie_10_or", any_of(names_dg_icd10)))|> glimpse()
  mutate(x2_dg_trs_psiq_cie_10_or = case_when(
    !is.na(x2_diagnostico_trs_psiquiatrico_sub_cie_10) &
      x2_diagnostico_trs_psiquiatrico_sub_cie_10 != "en estudio" &
      x2_diagnostico_trs_psiquiatrico_sub_cie_10 != "sin trastorno" ~
      dg_trs_psiq_icd10_sub_tab$main[match(x2_diagnostico_trs_psiquiatrico_sub_cie_10, dg_trs_psiq_icd10_sub_tab$sub)],
    TRUE ~ x2_diagnostico_trs_psiquiatrico_cie_10 
  ))|> #filter(x2_dg_trs_psiq_cie_10_or!=x2_diagnostico_trs_psiquiatrico_cie_10) |> select(c("hash_key","x2_dg_trs_psiq_cie_10_or", any_of(names_dg_icd10)))|> glimpse()
  mutate(x3_dg_trs_psiq_cie_10_or = case_when(
    !is.na(x3_diagnostico_trs_psiquiatrico_sub_cie_10) &
      x3_diagnostico_trs_psiquiatrico_sub_cie_10 != "en estudio" &
      x3_diagnostico_trs_psiquiatrico_sub_cie_10 != "sin trastorno" ~
      dg_trs_psiq_icd10_sub_tab$main[match(x3_diagnostico_trs_psiquiatrico_sub_cie_10, dg_trs_psiq_icd10_sub_tab$sub)],
    TRUE ~ x3_diagnostico_trs_psiquiatrico_cie_10 
  ))|> #filter(x3_dg_trs_psiq_cie_10_or!=x3_diagnostico_trs_psiquiatrico_cie_10) |> select(c("hash_key","x3_dg_trs_psiq_cie_10_or", any_of(names_dg_icd10)))|> glimpse()
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #in case of main dg but no sub-dg, DSM-IV: this empty field will be respected in the future in case replacing diagnoses
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  mutate(dg_trs_psiq_sub_dsm_iv_or = case_when(
    is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv) &
    !is.na(dg_trs_psiq_dsm_iv_or) &
    diagnostico_trs_psiquiatrico_dsm_iv != "en estudio" &
    diagnostico_trs_psiquiatrico_dsm_iv != "sin trastorno" ~ "NA_placeholder", # Or whatever value you want to assign
    TRUE ~ diagnostico_trs_psiquiatrico_sub_dsm_iv # Keep the original value otherwise
  ))|> #filter(dg_trs_psiq_sub_dsm_iv_or=="NA_placeholder") |> select(c("hash_key","dg_trs_psiq_sub_dsm_iv_or", any_of(names_dg_dsmiv)))|> glimpse()
  mutate(x2_dg_trs_psiq_sub_dsm_iv_or = case_when(
    is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv) &
    !is.na(x2_dg_trs_psiq_dsm_iv_or) &
    x2_diagnostico_trs_psiquiatrico_dsm_iv != "en estudio" &
    x2_diagnostico_trs_psiquiatrico_dsm_iv != "sin trastorno" ~ "NA_placeholder", # Or whatever value you want to assign
    TRUE ~ x2_diagnostico_trs_psiquiatrico_sub_dsm_iv # Keep the original value otherwise
  ))|> #filter(x2_dg_trs_psiq_sub_dsm_iv_or=="NA_placeholder") |> select(c("hash_key","x2_dg_trs_psiq_sub_dsm_iv_or", any_of(names_dg_dsmiv)))|> glimpse()
  mutate(x3_dg_trs_psiq_sub_dsm_iv_or = case_when(
    is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv) &
    !is.na(x3_dg_trs_psiq_dsm_iv_or) &
    x3_diagnostico_trs_psiquiatrico_dsm_iv != "en estudio" &
    x3_diagnostico_trs_psiquiatrico_dsm_iv != "sin trastorno" ~ "NA_placeholder", # Or whatever value you want to assign
    TRUE ~ x3_diagnostico_trs_psiquiatrico_sub_dsm_iv # Keep the original value otherwise
  ))|> #filter(x3_dg_trs_psiq_sub_dsm_iv_or=="NA_placeholder") |> select(c("hash_key","x3_dg_trs_psiq_sub_dsm_iv_or", any_of(names_dg_dsmiv)))|> glimpse()  
  mutate(dg_trs_psiq_sub_cie_10_or = case_when(
    is.na(diagnostico_trs_psiquiatrico_sub_cie_10) &
    !is.na(dg_trs_psiq_cie_10_or) &
    diagnostico_trs_psiquiatrico_cie_10 != "en estudio" &
    diagnostico_trs_psiquiatrico_cie_10 != "sin trastorno" ~ "NA_placeholder", # Or whatever value you want to assign
    TRUE ~ diagnostico_trs_psiquiatrico_sub_cie_10 # Keep the original value otherwise
  ))|> 
  mutate(x2_dg_trs_psiq_sub_cie_10_or = case_when(
    is.na(x2_diagnostico_trs_psiquiatrico_sub_cie_10) &
    !is.na(x2_dg_trs_psiq_cie_10_or) &
    x2_diagnostico_trs_psiquiatrico_cie_10 != "en estudio" &
    x2_diagnostico_trs_psiquiatrico_cie_10 != "sin trastorno" ~ "NA_placeholder", # Or whatever value you want to assign
    TRUE ~ x2_diagnostico_trs_psiquiatrico_sub_cie_10 # Keep the original value otherwise
  ))|> 
  mutate(x3_dg_trs_psiq_sub_cie_10_or = case_when(
    is.na(x3_diagnostico_trs_psiquiatrico_sub_cie_10) &
    !is.na(x3_dg_trs_psiq_cie_10_or) &
    x3_diagnostico_trs_psiquiatrico_cie_10 != "en estudio" &
    x3_diagnostico_trs_psiquiatrico_cie_10 != "sin trastorno" ~ "NA_placeholder", # Or whatever value you want to assign
    TRUE ~ x3_diagnostico_trs_psiquiatrico_sub_cie_10 # Keep the original value otherwise
  ))|> #filter(x3_dg_trs_psiq_sub_cie_10_or=="NA_placeholder") |> select(c("hash_key","x3_dg_trs_psiq_cie_10_or", any_of(names_dg_icd10)))|> glimpse()  
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Collapse and separate main
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # First create combined diagnosis pairs for comparison
  mutate(
    # Create diagnosis pairs for comparison
    diag_pair_1 = paste(dg_trs_psiq_cie_10_or, 
                        dg_trs_psiq_sub_cie_10_or, sep = "::"),
    diag_pair_2 = paste(x2_dg_trs_psiq_cie_10_or, 
                        x2_dg_trs_psiq_sub_cie_10_or, sep = "::"),
    diag_pair_3 = paste(x3_dg_trs_psiq_cie_10_or, 
                        x3_dg_trs_psiq_sub_cie_10_or, sep = "::"),
    # Now flag duplicates for removal
    keep_pair_2 = diag_pair_2 != diag_pair_1,
    keep_pair_3 = diag_pair_3 != diag_pair_1 & diag_pair_3 != diag_pair_2
  )|>
  # Apply the duplicate filtering
  mutate(
    # Set the filtered columns based on duplicate flags
    # prevents the same diagnosis from being counted multiple times when we combine them into the final concatenated field
    x2_diag_filtered = if_else(keep_pair_2, x2_dg_trs_psiq_cie_10_or, NA_character_),
    x2_subdiag_filtered = if_else(keep_pair_2, x2_dg_trs_psiq_sub_cie_10_or, NA_character_),
    x3_diag_filtered = if_else(keep_pair_3, x3_dg_trs_psiq_cie_10_or, NA_character_),
    x3_subdiag_filtered = if_else(keep_pair_3, x3_dg_trs_psiq_sub_cie_10_or, NA_character_)
  )

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
sub_dsm_iv_to_cie_10_comp_table <- rio::import(paste0(wdpath,"cons/_input/sub_dsm_iv_to_cie_10_comp_table.xlsx"))|> 
  # minusc, we changed tildes
  mutate(across(where(is.character), 
                ~stringi::stri_trans_general(., "Latin-ASCII")))  
invisible("Is not very useful to replace DSM-IV for ICD-10 codes. We dont know the source of the homologation and 31 sub-diagnostics are not homologued")
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("merge clean diagnoses\n")
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

invisible({
  SISTRAT23_c1_2010_2022_df_prev1n_mod1[, c("diag1", "diag2", "diag3") := .(NA_character_, NA_character_, NA_character_)]
  SISTRAT23_c1_2010_2022_df_prev1n_mod1[, c("diag1", "diag2", "diag3") := .(
    # First diagnosis
    ifelse(!is.na(dg_trs_psiq_cie_10_or) & 
           !dg_trs_psiq_cie_10_or %in% c("en estudio", "sin trastorno"),
           paste0(dg_trs_psiq_cie_10_or, "::", 
                  ifelse(!is.na(dg_trs_psiq_sub_cie_10_or), 
                         dg_trs_psiq_sub_cie_10_or, "NA")),
           NA_character_),
    # Second diagnosis 
    ifelse(!is.na(x2_diag_filtered) & 
           !x2_diag_filtered %in% c("en estudio", "sin trastorno"),
           paste0(x2_diag_filtered, "::", 
                  ifelse(!is.na(x2_subdiag_filtered), 
                         x2_subdiag_filtered, "NA")),
           NA_character_),
    # Third diagnosis
    ifelse(!is.na(x3_diag_filtered) & 
           !x3_diag_filtered %in% c("en estudio", "sin trastorno"),
           paste0(x3_diag_filtered, "::", 
                  ifelse(!is.na(x3_subdiag_filtered), 
                         x3_subdiag_filtered, "NA")),
           NA_character_)
  )]
  
  SISTRAT23_c1_2010_2022_df_prev1n_mod1[, mod_psiq_cie_10_or := {
    tmp <- na.omit(c(diag1, diag2, diag3))
    ifelse(length(tmp) > 0, paste(tmp, collapse = "; "), NA_character_)
  }, by=.I] #to operate by row
})

invisible("Function that may work well in the future (step 4)")
# Custom function to extract components in one pass
# SISTRAT23_c1_2010_2022_df_prev1n_mod1[, c(
#   "dg_cie_10_main_1", "dg_cie_10_sub_1",
#   "dg_cie_10_main_2", "dg_cie_10_sub_2", 
#   "dg_cie_10_main_3", "dg_cie_10_sub_3"
# ) := {
#   parts <- strsplit(mod_cie_10_or, "\\s*;\\s*")[[1]]
#   
#   # Initialize with NAs
#   result <- rep(NA_character_, 6)
#   
#   # Parse up to 3 diagnoses
#   for (i in 1:min(length(parts), 3)) {
#     if (!is.na(parts[i])) {
#       subparts <- strsplit(parts[i], "\\s*::\\s*")[[1]]
#       result[2*i-1] <- subparts[1]
#       result[2*i] <- if (length(subparts) > 1) subparts[2] else NA_character_
#     }
#   }
#   as.list(result)
# }]

# Remove memory
gc()

# Remove temporary columns if needed
invisible({
  SISTRAT23_c1_2010_2022_df_prev1n_mod1[, c("diag1", "diag2", "diag3") := NULL]
})

SISTRAT23_c1_2010_2022_df_prev1n_mod2<-
SISTRAT23_c1_2010_2022_df_prev1n_mod1|> 
  rowwise()|> 
  # Detect any in study or diagnostic of no disorder detected
  mutate(dg_psiq_cie_10_instudy = any(c_across(c(dg_trs_psiq_cie_10_or, x2_diag_filtered, x3_diag_filtered)) %in% c("en estudio")))|> 
  mutate(dg_psiq_cie_10_no_dg = any(c_across(c(dg_trs_psiq_cie_10_or, x2_diag_filtered, x3_diag_filtered)) %in% c("sin trastorno")))|>
  # Any diagnostic different than in study or non-detected
  mutate(dg_psiq_cie_10_dg = any(!is.na(c_across(c(dg_trs_psiq_cie_10_or, x2_diag_filtered, x3_diag_filtered))) & !(c_across(c(dg_trs_psiq_cie_10_or, x2_diag_filtered, x3_diag_filtered)) %in% c("en estudio", "sin trastorno"))))|>
  ungroup()|>
  select(-contains("_pair_"), -ends_with("_filtered"))|> 
# replace with NA
  #select(c("hash_key", contains("cie_10")))|> slice(95:100)
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Collapse and separate main (NOW FOR DSM-IV DIAGNOSES)
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # First create combined diagnosis pairs for comparison
  mutate(
    # Create diagnosis pairs for comparison
    diag_pair_1 = paste(dg_trs_psiq_dsm_iv_or, 
                        dg_trs_psiq_sub_dsm_iv_or, sep = "::"),
    diag_pair_2 = paste(x2_dg_trs_psiq_dsm_iv_or, 
                        x2_dg_trs_psiq_sub_dsm_iv_or, sep = "::"),
    diag_pair_3 = paste(x3_dg_trs_psiq_dsm_iv_or, 
                        x3_dg_trs_psiq_sub_dsm_iv_or, sep = "::"),
    # Now flag duplicates for removal
    keep_pair_2 = diag_pair_2 != diag_pair_1,
    keep_pair_3 = diag_pair_3 != diag_pair_1 & diag_pair_3 != diag_pair_2
  )|>
  # Apply the duplicate filtering
  mutate(
    # Set the filtered columns based on duplicate flags
    # prevents the same diagnosis from being counted multiple times when we combine them into the final concatenated field
    x2_diag_filtered = if_else(keep_pair_2, x2_dg_trs_psiq_dsm_iv_or, NA_character_),
    x2_subdiag_filtered = if_else(keep_pair_2, x2_dg_trs_psiq_sub_dsm_iv_or, NA_character_),
    x3_diag_filtered = if_else(keep_pair_3, x3_dg_trs_psiq_dsm_iv_or, NA_character_),
    x3_subdiag_filtered = if_else(keep_pair_3, x3_dg_trs_psiq_sub_dsm_iv_or, NA_character_)
  )

invisible({
  SISTRAT23_c1_2010_2022_df_prev1n_mod2[, c("diag1_dsm", "diag2_dsm", "diag3_dsm") := .(
    # First diagnosis
    ifelse(!is.na(dg_trs_psiq_dsm_iv_or) & 
           !dg_trs_psiq_dsm_iv_or %in% c("en estudio", "sin trastorno"),
           paste0(dg_trs_psiq_dsm_iv_or, "::", 
                  ifelse(!is.na(dg_trs_psiq_sub_dsm_iv_or), 
                         dg_trs_psiq_sub_dsm_iv_or, "NA")),
           NA_character_),
    # Second diagnosis 
    ifelse(!is.na(x2_diag_filtered) & 
           !x2_diag_filtered %in% c("en estudio", "sin trastorno"),
           paste0(x2_diag_filtered, "::", 
                  ifelse(!is.na(x2_subdiag_filtered), 
                         x2_subdiag_filtered, "NA")),
           NA_character_),
    # Third diagnosis
    ifelse(!is.na(x3_diag_filtered) & 
           !x3_diag_filtered %in% c("en estudio", "sin trastorno"),
           paste0(x3_diag_filtered, "::", 
                  ifelse(!is.na(x3_subdiag_filtered), 
                         x3_subdiag_filtered, "NA")),
           NA_character_)
  )]
  
  SISTRAT23_c1_2010_2022_df_prev1n_mod2[, mod_psiq_dsm_iv_or := {
    tmp_dsm <- na.omit(c(diag1_dsm, diag2_dsm, diag3_dsm))
    ifelse(length(tmp_dsm) > 0, paste(tmp_dsm, collapse = "; "), NA_character_)
  }, by=.I] #to operate by row
})

#Remove diag columns
invisible({
  SISTRAT23_c1_2010_2022_df_prev1n_mod2[, c("diag1_dsm", "diag2_dsm", "diag3_dsm") := NULL]
})


SISTRAT23_c1_2010_2022_df_prev1n_mod3<-
SISTRAT23_c1_2010_2022_df_prev1n_mod2|> 
  rowwise()|> 
  # Detect any in study or diagnostic of no disorder detected
  mutate(dg_psiq_dsm_iv_instudy = any(c_across(c(dg_trs_psiq_dsm_iv_or, x2_diag_filtered, x3_diag_filtered)) %in% c("en estudio")))|> 
  mutate(dg_psiq_dsm_iv_no_dg = any(c_across(c(dg_trs_psiq_dsm_iv_or, x2_diag_filtered, x3_diag_filtered)) %in% c("sin trastorno")))|>
  # Any diagnostic different than in study or non-detected
  mutate(dg_psiq_dsm_iv_dg = any(!is.na(c_across(c(dg_trs_psiq_dsm_iv_or, x2_diag_filtered, x3_diag_filtered))) & !(c_across(c(dg_trs_psiq_dsm_iv_or, x2_diag_filtered, x3_diag_filtered)) %in% c("en estudio", "sin trastorno"))))|>
  ungroup()|>
  select(-contains("_pair_"), -ends_with("_filtered")) 
# replace with NA
  #select(c("hash_key", contains("dsm_iv")))|> slice(95:100)
Cases with sub-diagnostics but without the main: DSM-IV
[1] 0
second dg.
[1] 0
third dg.
[1] 3
Cases with sub-diagnostics but without the main: ICD-10
[1] 0
second dg.
[1] 0
third dg.
[1] 0
Cases with sub-diagnostics but without the main: DSM-IV
[1] 0
second dg.
[1] 0
third dg.
[1] 3
Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno): ICD-10
[1] 3
second dg.
[1] 0
third dg.
[1] 0
Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno): DSM-IV
[1] 2
second dg.
[1] 0
third dg.
[1] 4
to standardize the main category with the DSM-IV subcategory
to standardize the main category with the DSM-IV subcategory
remove redundancies and duplicates in diagnoses
merge clean diagnoses
            used   (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells   4556962  243.4    7364831  393.4   7364831  393.4
Vcells 657570067 5016.9 1027311897 7837.8 823777985 6285.0

1.2. Ethnicity

To generate a more inclusive approach to ethnic identification and assuming that ethnicity is invariable per person, all ethnicity records associated with each individual were consolidated from the original dataset (SISTRAT23_c1_2010_2022_df), preserving the diversity of self-identifications through semicolon-separated values. We excluded non-reported ethnicity data (inclusive_historical_ethnicity_by_run). We also added ethnicity data from C2 to C6. This variable is called ethnicity_c1_c6_historic. For more inclusion, we added information of ethnicity of databases of 2022 to 2024.

Code
inclusive_historical_ethnicity_by_run<-
SISTRAT23_c1_2010_2022_df |> 
  filter(etnia!="no pertenece", !is.na(etnia))|>
  group_by(hash_key) |>
  summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
  ungroup() #|>filter(grepl(";",etnias_distinct))

inclusive_historical_ethnicity_by_run_2324<-
SISTRAT23_c1_2023_2024_df |> 
  filter(etnia!="no pertenece", !is.na(etnia))|>
  group_by(hashkey) |>
  summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
  ungroup() #|>filter(grepl(";",etnias_distinct))

c2_inclusive_historical_ethnicity_by_run<-
CONS_C2 |> 
  filter(etnia!="no pertenece", !is.na(etnia)) |> 
  group_by(HASH_KEY) |>
  summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
  ungroup() #|> filter(grepl(";",etnias_distinct))

c2_inclusive_historical_ethnicity_by_run_2324<-
c2_2324 |> 
  filter(etnia!="No pertenece", !is.na(etnia)) |> 
  group_by(hashkey) |>
  summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
  ungroup() #|> filter(grepl(";",etnias_distinct))
c2_inclusive_historical_ethnicity_by_run_2324$etnias_distinct <- tolower(c2_inclusive_historical_ethnicity_by_run_2324$etnias_distinct)

c3_inclusive_historical_ethnicity_by_run<-
CONS_C3 |> 
  filter(etnia!="no pertenece", !is.na(etnia)) |> 
  group_by(HASH_KEY) |>
  summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
  ungroup() #|> filter(grepl(";",etnias_distinct))

c4_inclusive_historical_ethnicity_by_run<-
CONS_C4 |> 
  filter(etnia!="no pertenece", !is.na(etnia)) |> 
  group_by(HASH_KEY) |>
  summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
  ungroup() #|> filter(grepl(";",etnias_distinct))

c5_inclusive_historical_ethnicity_by_run<-
CONS_C5 |> 
  filter(etnia!="no pertenece", !is.na(etnia)) |> 
  group_by(HASH_KEY) |>
  summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
  ungroup() #|> filter(grepl(";",etnias_distinct))

c6_inclusive_historical_ethnicity_by_run<-
  if(filter(CONS_C6, paisnacimiento=="no pertenece") |> nrow()>0){
CONS_C6 |>
  filter(paisnacimiento!="no pertenece", !is.na(paisnacimiento), paisnacimiento!="chile") |>
  group_by(HASH_KEY) |>
  summarise(etnias_distinct = paste(unique(paisnacimiento), collapse = "; ")) |>
  ungroup() #|> filter(grepl(";",etnias_distinct))
  } else {
  CONS_C6 |>
  filter(etnia!="no pertenece", !is.na(etnia), etnia!="chile") |>
  group_by(HASH_KEY) |>
  summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
  ungroup() #|> filter(grepl(";",etnias_distinct))
}


SISTRAT23_c1_2010_2022_df_prev1o<-
SISTRAT23_c1_2010_2022_df_prev1n_mod3|> 
    (\(df) {
    cat(paste0("5.Number of cases after normalization of data editing: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("5.Number of patients after normalization of data editing: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })()|>
  left_join(inclusive_historical_ethnicity_by_run_2324, by=c("hash_key"="hashkey"), multiple="first")|>
  rename("ethnicity_inclusive2324"="etnias_distinct")|> 
  left_join(inclusive_historical_ethnicity_by_run, by="hash_key", multiple="first")|>
  rename("ethnicity_inclusive"="etnias_distinct")|> 
  left_join(c2_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  rename("ethnicity_inclusive_c2"="etnias_distinct")|> 
  left_join(c2_inclusive_historical_ethnicity_by_run_2324, by=c("hash_key"="hashkey"), multiple="first")|>
  rename("ethnicity_inclusive_c2_2224"="etnias_distinct")|>   
  left_join(c3_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  rename("ethnicity_inclusive_c3"="etnias_distinct")|> 
  left_join(c4_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  rename("ethnicity_inclusive_c4"="etnias_distinct")|> 
  left_join(c5_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  rename("ethnicity_inclusive_c5"="etnias_distinct")|> 
  left_join(c6_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  rename("ethnicity_inclusive_c6"="etnias_distinct")|> 
    (\(df) {
    cat(paste0("5. After normalization and data editing, obs.: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("5. After normalization and data editing, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1n))stop("Error: Added treatment episodes in the process")
    df
  })()

SISTRAT23_c1_2010_2022_df_prev1o<-
# First, split each ethnicity column by "; ", extract unique values, and combine into a new column
SISTRAT23_c1_2010_2022_df_prev1o|>
  rowwise()|>
  mutate(
    ethnicity_c1_c6_historic = {
      # Get all columns starting with "ethnicity_inclusive_"
      # Use the names of the dataframe directly
      all_cols <- names(SISTRAT23_c1_2010_2022_df_prev1o)
      eth_cols <- c("ethnicity_inclusive", 
                   grep("^ethnicity_inclusive_", all_cols, value = TRUE))
      
      # Extract values from these columns that exist
      eth_values <- c()
      for (col in eth_cols) {
        if (col %in% all_cols) {
          val <- get(col)
          if (!is.na(val)) eth_values <- c(eth_values, val)
        }
      }
      
      # Split each value by semicolon and flatten
      if (length(eth_values) > 0) {
        all_eth <- unlist(strsplit(eth_values, "\\s*;\\s*"))
        paste(unique(all_eth), collapse = "; ")
      } else {
        NA_character_
      }
    }
  )|>
  ungroup()|>
  select(-any_of((starts_with("ethnicity_inclusive"))))
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

#table(SISTRAT23_c1_2010_2022_df_prev1o$tipo_centro_derivacion, SISTRAT23_c1_2010_2022_df_prev1o$motivo_de_egreso)
#table(SISTRAT23_c1_2010_2022_df_prev1o$motivo_de_egreso_alta_administrativa, SISTRAT23_c1_2010_2022_df_prev1o$motivo_de_egreso)
5.Number of cases after normalization of data editing: 150,046 
5.Number of patients after normalization of data editing: 106,283 
5. After normalization and data editing, obs.: 150,046 
5. After normalization and data editing, RUNs: 106,283 

2. More than One Value within User, Concerning User-Invariant Variables

We need to obtain sociodemographic categories that are usually invariant for a given individual. Although this assumption is highly debatable, it allows us to detect inequalities stemming from these distinctions and their associations with social roles and stigmatization. For this purpose, we used external databases linked to SENDA agreements 2 through 6, together with hospitalization and Prosecutor’s Office databases.

Code
knitr::include_graphics(paste0(wdpath,"cons/_figs/same_hash_distinct_values_user_invariant.svg"))

2.1. Sex

  • Sex (sexo_2) (patients= 500). If there were only two observations with distinct sexes but in the same yearly dataset, we replaced the sex to “women” if the user has ever been in a type of program specifically for women, or if the user has ever been pregnant.

The primary approach differs depending on the availability of external data (count_not_na). The system first checks for clear agreement between internal (c1_perc_mujer) and external (perc_fem_ext) data. If both sources indicate a strong majority (>50%) for the same sex, that sex is assigned (Cases 6.a.a.1, 6.a.a.2). In cases of disagreement or ambiguity (e.g., one source shows a tie at 50%, or sources point to different sexes), the decision relies on comparing the quantity of records in each source (total vs. count_not_na). Generally, the source with more records is given higher weight.

When Only Internal Data is Available (count_not_na == 0), the decision relies solely on the internal data proportion (c1_perc_mujer) and the total number of internal records (total), and consider ties.

Code
decision_sex_inconsistencies<- 
DiagrammeR::grViz("
digraph decision_tree {
  graph [rankdir = TB, nodesep = 0.5]
  node [fontname = Helvetica, shape = diamond, width = 3.5, height = 1.2]
  edge [fontname = Helvetica]

  # Start node
  start [label = 'Start', shape = oval]

  # Main branches
  node0 [label = 'External data exists?\n(count_not_na > 0)']
  
  # ========== EXTERNAL DATA AVAILABLE (6.a) ==========
  subgraph cluster_external {
    label = 'External Data Available (count_not_na > 0)'
    color = blue
    
    node1 [label = 'Both internal & external >50% female?\n(c1_perc_mujer > 0.5 & perc_fem_ext > 0.5)']
    node2 [label = 'Both internal & external <50% female?\n(c1_perc_mujer < 0.5 & perc_fem_ext < 0.5)']
    node3 [label = 'Internal tie & external female majority &\nExternal data > internal?']
    node4 [label = 'Internal tie & external male majority &\nExternal data > internal?']
    node5 [label = 'Internal female majority & external tie &\nInternal data > external?']
    node6 [label = 'Internal male majority & external tie &\nInternal data > external?']
    node7 [label = 'Internal tie & external female majority &\nInternal data > external?']
    node8 [label = 'Internal tie & external male majority &\nInternal data > external?']
    node9 [label = 'External majority female & internal <50% &\nExternal data > internal?']
    node10 [label = 'External majority male & internal >=50% &\nExternal data > internal?']
    node11 [label = 'External tie & internal female majority &\nExternal data > internal?']
    node12 [label = 'External tie & internal male majority &\nExternal data > internal?']
    node13 [label = 'Conflicting majorities &\nEqual data quantity']
    node14 [label = 'Tie in both internal &\nexternal data']
    
    # Outcomes
    outcome1 [label = '6.a.a.1.female', shape = box]
    outcome2 [label = '6.a.a.2.male', shape = box]
    outcome3 [label = '6.a.b.1.female', shape = box]
    outcome4 [label = '6.a.b.2.male', shape = box]
    outcome5 [label = '6.a.b.1.female', shape = box]
    outcome6 [label = '6.a.b.2.male', shape = box]
    outcome7 [label = '6.a.b.1.female, but ask aux data', shape = box]
    outcome8 [label = '6.a.b.2.male, but ask aux data', shape = box]
    outcome9 [label = '6.a.b.1.female', shape = box]
    outcome10 [label = '6.a.b.2.male', shape = box]
    outcome11 [label = '6.a.b.1.female, but ask aux data', shape = box]
    outcome12 [label = '6.a.b.2.male, but ask aux data', shape = box]
    outcome13 [label = '6.a.b.3.nondet', shape = box]
    outcome14 [label = '6.a.c.nondet', shape = box]
    
    # Connections
    node0 -> node1 [label = 'Yes']
    
    node1 -> outcome1 [label = 'Yes']
    node1 -> node2 [label = 'No']
    
    node2 -> outcome2 [label = 'Yes']
    node2 -> node3 [label = 'No']
    
    node3 -> outcome3 [label = 'Yes']
    node3 -> node4 [label = 'No']
    
    node4 -> outcome4 [label = 'Yes']
    node4 -> node5 [label = 'No']
    
    node5 -> outcome5 [label = 'Yes']
    node5 -> node6 [label = 'No']
    
    node6 -> outcome6 [label = 'Yes']
    node6 -> node7 [label = 'No']
    
    node7 -> outcome7 [label = 'Yes']
    node7 -> node8 [label = 'No']
    
    node8 -> outcome8 [label = 'Yes']
    node8 -> node9 [label = 'No']
    
    node9 -> outcome9 [label = 'Yes']
    node9 -> node10 [label = 'No']
    
    node10 -> outcome10 [label = 'Yes']
    node10 -> node11 [label = 'No']
    
    node11 -> outcome11 [label = 'Yes']
    node11 -> node12 [label = 'No']
    
    node12 -> outcome12 [label = 'Yes']
    node12 -> node13 [label = 'No']
    
    node13 -> outcome13 [label = 'Yes']
    node13 -> node14 [label = 'No']
    
    node14 -> outcome14 [label = 'Yes']
  }

  # ========== NO EXTERNAL DATA (6.b) ==========
  subgraph cluster_internal {
    label = 'No External Data (count_not_na == 0)'
    color = green
    
    node15 [label = 'Total even?\n(total in 1-40 records\neven numbers)']
    node16 [label = 'Internal majority female?\n(c1_perc_mujer > 0.5)']
    node17 [label = 'Internal majority male?\n(c1_perc_mujer < 0.5)']
    node18 [label = 'Total odd?']
    
    # Outcomes
    outcome15 [label = '6.b.a.1.female', shape = box]
    outcome16 [label = '6.b.a.2.male', shape = box]
    outcome17 [label = '6.b.a.3.nondet', shape = box]
    outcome18 [label = '6.b.b.1.female', shape = box]
    outcome19 [label = '6.b.b.1.male', shape = box]
    
    # Connections
    node0 -> node15 [label = 'No']
    
    node15 -> node16 [label = 'Yes']
    node15 -> node18 [label = 'No']
    
    node16 -> outcome15 [label = 'Yes']
    node16 -> node17 [label = 'No']
    
    node17 -> outcome16 [label = 'Yes']
    node17 -> outcome17 [label = 'No']
    
    node18 -> outcome18 [label = 'c1_perc_mujer >= 0.5']
    node18 -> outcome19 [label = 'c1_perc_mujer < 0.5']
  }

  start -> node0
}
",
  width  = 1200,
  height = 900
  )



#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
invisible("Export database to explore it")
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
# WidthCM<-8
# HeightCM<-6
# DPI<-600
unlink(paste0(wdpath,"cons/_figs/decision_sex_inconsistencies_files"), recursive = TRUE)

htmlwidgets::saveWidget(decision_sex_inconsistencies, paste0(wdpath,"cons/_figs/decision_sex_inconsistencies.html"))
webshot::webshot(paste0(wdpath,"cons/_figs/decision_sex_inconsistencies.html"),paste0(wdpath,"cons/_figs/decision_sex_inconsistencies.png"), vwidth = 300*1.2, vheight = 300,  zoom=10, expand=100)  # Prueba con diferentes coordenadas top, left, width, and height.

Registered S3 methods overwritten by ‘callr’: method from format.callr_status_error
print.callr_status_error

Decision tree for inconsistencies in information about sex

Decision tree for inconsistencies in information about sex

[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
Code
invalid_sex_by_patient<-
SISTRAT23_c1_2010_2022_df_prev1o|>  group_by(hash_key)|> summarise(sexo_por_hash = n_distinct(sexo), miss_sexo = sum(is.na(sexo), na.rm=T), tot_obs = n())|> ungroup()|> mutate(perc_miss_sexo = miss_sexo/tot_obs)|> filter(sexo_por_hash>1|perc_miss_sexo==1)|> pull(hash_key)

invisible("======================================================")  
invalid_sex_hashs_hosp<-  
HOSP_filter_df|> 
    filter(run %in% invalid_sex_by_patient)|>
    (\(df) {
        print(message(paste0("Hospital, Entries: ", nrow(df))))
        print(message(paste0("Hospital, RUNs: ", distinct(df, run) |> nrow())))
        df
    })()|>
    distinct(run, sexo)|>
    group_by(run)|>
    mutate(id = as.character(dplyr::row_number()))|>
    pivot_wider(names_from = id, values_from = sexo, 
                           names_prefix = "h_sex_")

Hospital, Entries: 1662

Hospital, RUNs: 372

Code
# Hospital, Entries: 1656
# NULL
# Hospital, RUNs: 371
# NULL
invisible("======================================================")
invalid_sex_top<-  
SISTRAT23_top_2015_2022_df|>
    filter(HASH_KEY %in% invalid_sex_by_patient)|>
    (\(df) {
        print(message(paste0("TOP, Entries: ", nrow(df))))
        print(message(paste0("TOP, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    distinct(HASH_KEY, sexo)|>
    #select(HASH_KEY, sexo)|>  
    ungroup()|>
    group_by(HASH_KEY)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = sexo, 
                           names_prefix = "t_sex_")

TOP, Entries: 1523

TOP, RUNs: 311

Code
# TOP, Entries: 1518
# NULL
# TOP, RUNs: 310
# NULL
invisible("======================================================")
invalid_sex_c2<-  
CONS_C2 |>
    filter(HASH_KEY %in% invalid_sex_by_patient)|>
    (\(df) {
        print(message(paste0("C2, Entries: ", nrow(df))))
        print(message(paste0("C2, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    distinct(HASH_KEY, sexo)|>
    ungroup()|>
    group_by(HASH_KEY)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = sexo, 
                           names_prefix = "c2_sex_")

C2, Entries: 0

C2, RUNs: 0

Code
# C2, Entries: 0
# NULL
# C2, RUNs: 0
# NULL
invisible("======================================================")
invalid_sex_c3<-  
CONS_C3|>
    filter(HASH_KEY %in% invalid_sex_by_patient)|>
    (\(df) {
        print(message(paste0("C3, Entries: ", nrow(df))))
        print(message(paste0("C3, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    distinct(HASH_KEY, sexo)|>
    ungroup()|>
    group_by(HASH_KEY)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = sexo, 
                           names_prefix = "c3_sex_")

C3, Entries: 5

C3, RUNs: 4

Code
# C3, Entries: 4
# NULL
# C3, RUNs: 4
# NULL
invisible("======================================================")
invalid_sex_c4<-  
CONS_C4 |>
    filter(HASH_KEY %in% invalid_sex_by_patient)|>
    (\(df) {
        print(message(paste0("C4, Entries: ", nrow(df))))
        print(message(paste0("C4, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    distinct(HASH_KEY, sexo) |>
    ungroup()|>
    group_by(HASH_KEY)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = sexo, 
                           names_prefix = "c4_sex_")

C4, Entries: 2

C4, RUNs: 1

Code
# C4, Entries: 2
# NULL
# C4, RUNs: 1
# NULL
invisible("======================================================")
invalid_sex_c5<-  
CONS_C5 |>
    filter(HASH_KEY %in% invalid_sex_by_patient)|>
    (\(df) {
        print(message(paste0("C5, Entries: ", nrow(df))))
        print(message(paste0("C5, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })() |>  
    distinct(HASH_KEY, sexo)|>
    ungroup()|>
    group_by(HASH_KEY)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = sexo, 
                           names_prefix = "c5_sex_")

C5, Entries: 1

C5, RUNs: 1

Code
# C5, Entries: 1
# NULL
# C5, RUNs: 1
# NULL
invisible("======================================================")
invalid_sex_c6<-  
CONS_C6 |>
    filter(HASH_KEY %in% invalid_sex_by_patient)|>
    (\(df) {
        print(message(paste0("C6, Entries: ", nrow(df))))
        print(message(paste0("C6, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    distinct(HASH_KEY, sexo)|>
    ungroup()|>
    group_by(HASH_KEY)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = sexo, 
                           names_prefix = "c6_sex_")

C6, Entries: 1

C6, RUNs: 1

Code
# C6, Entries: 1
# NULL
# C6, RUNs: 1
# NULL
invisible("======================================================")
invalid_sex_mortality<-  
mortality |>
    filter(hashkey %in% invalid_sex_by_patient)|>
    (\(df) {
        print(message(paste0("Mortality, Entries: ", nrow(df))))
        print(message(paste0("Mortality, RUNs: ", distinct(df, hashkey)|> nrow())))
        df
    })()|>  
    distinct(hashkey, sexo)|>
    ungroup()|> 
  rename("m_sexo"="sexo")

Mortality, Entries: 15

Mortality, RUNs: 15

Code
# Mortality, Entries: 15
# NULL
# Mortality, RUNs: 15
# NULL
invisible("======================================================")
invalid_sex_may23_PO_office<-  
OLD_NEW_SISTRAT23_c1_2010_2022_df2|>
  tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY.y"="rut_enc_saf"))|> 
  select("HASH_KEY.x","HASH_KEY.y", "sexo.y")|> 
  filter(HASH_KEY.x %in% invalid_sex_by_patient)|>
    (\(df) {
        print(message(paste0("PO Office, Entries: ", nrow(df))))
        print(message(paste0("PO Office, RUNs: ", distinct(df, HASH_KEY.x) |> nrow())))
        df
    })()|>  
    group_by(HASH_KEY.x)|>
    summarise(femenino = sum(grepl("FEM", sexo.y)),masculino = sum(grepl("MASC", sexo.y)), total=n())|> 
    ungroup()|> 
    mutate(po_perc_fem = femenino / total, po_perc_masc = masculino / total)|> 
    filter(po_perc_masc<.5|po_perc_fem>.5)|>
    (\(df) {
        print(message(paste0("PO Office, only clear sexes, RUNs: ", distinct(df, HASH_KEY.x) |> nrow())))
        df
    })()

right_join: added 5 columns (sexo.x, fec_nacimiento_simple, sexo.y, avg_birth_date_po, n_dis_birth_date_po)

        > rows only in OLD_NEW_SISTRAT23_c1_20.. (   58,487)
        > rows only in Base_fiscalia_v2              18,409
        > matched rows                            1,197,510    (includes duplicates)
        >                                        ===========
        > rows total                              1,215,919

PO Office, Entries: 18235

PO Office, RUNs: 454

PO Office, only clear sexes, RUNs: 283

Code
# PO Office, Entries: 18205
# NULL
# PO Office, RUNs: 452
# NULL    
# PO Office, only clear sexes, RUNs: 282
# NULL

invisible("======================================================")
invalid_sex_c1_2324<-  
SISTRAT23_c1_2023_2024_df2|>
    filter(hash_key %in% invalid_sex_by_patient)|>
    (\(df) {
        print(message(paste0("C1, 23-24, Entries: ", nrow(df))))
        print(message(paste0("C1, 23-24, RUNs: ", distinct(df, hash_key)|> nrow())))
        df
    })()|>  
    distinct(hash_key, sexo)|>
    ungroup()|>
    group_by(hash_key)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = sexo, 
                           names_prefix = "c1_2324_sex_")

C1, 23-24, Entries: 99 C1, 23-24, RUNs: 68

Code
invisible("======================================================")
invalid_sex_c1_2324_idgen<-  
SISTRAT23_c1_2023_2024_df2|>
    filter(hash_key %in% invalid_sex_by_patient)|>
    (\(df) {
        print(message(paste0("C1, 23-24, Gender identity, Entries: ", nrow(df))))
        print(message(paste0("C1, 23-24, Gender identity, RUNs: ", distinct(df, hash_key)|> nrow())))
        df
    })()|>  
    distinct(hash_key, identidad_de_genero)|>
    ungroup()|>
    group_by(hash_key)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = identidad_de_genero, 
                           names_prefix = "c1_2324_genid_")

C1, 23-24, Gender identity, Entries: 99 C1, 23-24, Gender identity, RUNs: 68

Code
invisible("======================================================")
invalid_sex_top_2224<-  
top_2224|>
    filter(hashkey %in% invalid_sex_by_patient)|>
    (\(df) {
        print(message(paste0("TOP, 23-24, Entries: ", nrow(df))))
        print(message(paste0("TOP, 23-24, RUNs: ", distinct(df, hashkey)|> nrow())))
        df
    })()|>  
    distinct(hashkey, sexo)|>
    ungroup()|>
    group_by(hashkey)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = sexo, 
                           names_prefix = "top_2224_sex_")

TOP, 23-24, Entries: 79 TOP, 23-24, RUNs: 29

Code
invisible("======================================================")
invalid_sex_c2_2224<-  
c2_2324 |>
    filter(hashkey %in% invalid_sex_by_patient)|>
    (\(df) {
        print(message(paste0("C2, 22-24, Entries: ", nrow(df))))
        print(message(paste0("C2, 22-24, RUNs: ", distinct(df, hashkey)|> nrow())))
        df
    })()|>  
    distinct(hashkey, sexo)|>
    ungroup()|>
    group_by(hashkey)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = sexo, 
                           names_prefix = "c2_2224_sex_")

C2, 22-24, Entries: 0 C2, 22-24, RUNs: 0

Code
# C2, 22-24, Entries: 0
# NULL
# C2, 22-24, RUNs: 0
# NULL

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

invalid_sex_ext_info<-
SISTRAT23_c1_2010_2022_df_prev1o|>
  tidytable::filter(hash_key %in% invalid_sex_by_patient)|>
  select(hash_key, sexo)|> 
  group_by(hash_key)|>
  summarise(hombre = sum(grepl("hom", sexo)),mujer = sum(grepl("muj", sexo)), total=n())|> 
  ungroup()|> 
  mutate(c1_perc_hombre = hombre / total, c1_perc_mujer = mujer / total)|>
  select(hash_key, c1_perc_hombre, c1_perc_mujer, total)|> 
  tidylog::left_join(invalid_sex_hashs_hosp, by=c("hash_key"="run"), multiple="first")|>
  tidytable::select(hash_key, c1_perc_hombre, c1_perc_mujer, total, h_sex_1, h_sex_2)|>
  tidylog::left_join(invalid_sex_top, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidylog::left_join(invalid_sex_c2, by=c("hash_key"="HASH_KEY"), multiple="first")|> 
  tidylog::left_join(invalid_sex_c3, by=c("hash_key"="HASH_KEY"), multiple="first")|> 
  tidylog::left_join(invalid_sex_c4, by=c("hash_key"="HASH_KEY"), multiple="first")|> 
  tidylog::left_join(invalid_sex_c5, by=c("hash_key"="HASH_KEY"), multiple="first")|> 
  tidylog::left_join(invalid_sex_c6, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidylog::left_join(invalid_sex_mortality, by=c("hash_key"="hashkey"), multiple="first")|>
  tidylog::left_join(invalid_sex_may23_PO_office[,c("HASH_KEY.x","po_perc_fem", "po_perc_masc")], by=c("hash_key"="HASH_KEY.x"), multiple="first")|> 
  tidylog::left_join(invalid_sex_c1_2324, by=c("hash_key"="hash_key"), multiple="first")|>
  tidylog::left_join(invalid_sex_top_2224, by=c("hash_key"="hashkey"), multiple="first")|>
  tidylog::left_join(invalid_sex_c2_2224, by=c("hash_key"="hashkey"), multiple="first")|>   
      (\(df) {
        print(message(paste0("Invalid sex that have at least one external sex, Entries: ", nrow(df))))
        print(message(paste0("Invalid sex that have at least one external sex, RUNs: ", tidytable::distinct(df, hash_key)|> nrow())))
        df
    })()|> 
  (\(df) {
    columns <- c("h_sex_1", "h_sex_2", "t_sex_1", "t_sex_2", "c3_sex_1", 
                "c4_sex_1", "c5_sex_1", "c6_sex_1", "m_sexo", "po_perc_fem", "po_perc_masc", "c1_2324_sex_1", "c1_2324_sex_2", "top_2224_sex_1")
    mutate(df, count_not_na = rowSums(!is.na(select(df, all_of(columns)))))
  })()|> 
  rowwise()|>
  mutate(count_fem = sum(h_sex_1==2, 
                        h_sex_2==2, #amaru= mutate(SEX= factor(SEXO, levels= c(1, 2), labels= c("male", "female")))
                        t_sex_1=="mujer", 
                        t_sex_2=="mujer",
                        c3_sex_1=="mujer",
                        c4_sex_1=="mujer",
                        c5_sex_1=="femenino",
                        c6_sex_1=="mujer",
                        m_sexo==2,
                        po_perc_fem>.5,
                        c1_2324_sex_1=="mujer",
                        c1_2324_sex_2=="mujer",
                        top_2224_sex_1=="Mujer", na.rm=T))|>
  ungroup()|>
  mutate(perc_fem_ext= count_fem/count_not_na)

left_join: added 2 columns (h_sex_1, h_sex_2) > rows only in select(mutate(ungroup(s.. 128 > rows only in invalid_sex_hashs_hosp ( 0) > matched rows 372 > ===== > rows total 500 left_join: added 2 columns (t_sex_1, t_sex_2) > rows only in tidytable::select(tidyl.. 189 > rows only in invalid_sex_top ( 0) > matched rows 311 > ===== > rows total 500 left_join: added no columns > rows only in tidylog::left_join(tidy.. 500 > rows only in invalid_sex_c2 ( 0) > matched rows 0 > ===== > rows total 500 left_join: added one column (c3_sex_1) > rows only in tidylog::left_join(tidy.. 496 > rows only in invalid_sex_c3 ( 0) > matched rows 4 > ===== > rows total 500 left_join: added one column (c4_sex_1) > rows only in tidylog::left_join(tidy.. 499 > rows only in invalid_sex_c4 ( 0) > matched rows 1 > ===== > rows total 500 left_join: added one column (c5_sex_1) > rows only in tidylog::left_join(tidy.. 499 > rows only in invalid_sex_c5 ( 0) > matched rows 1 > ===== > rows total 500 left_join: added one column (c6_sex_1) > rows only in tidylog::left_join(tidy.. 499 > rows only in invalid_sex_c6 ( 0) > matched rows 1 > ===== > rows total 500 left_join: added one column (m_sexo) > rows only in tidylog::left_join(tidy.. 485 > rows only in invalid_sex_mortality ( 0) > matched rows 15 > ===== > rows total 500 left_join: added 2 columns (po_perc_fem, po_perc_masc) > rows only in tidylog::left_join(tidy.. 217 > rows only in invalid_sex_may23_PO_of.. ( 0) > matched rows 283 > ===== > rows total 500 left_join: added 2 columns (c1_2324_sex_1, c1_2324_sex_2) > rows only in tidylog::left_join(tidy.. 432 > rows only in invalid_sex_c1_2324 ( 0) > matched rows 68 > ===== > rows total 500 left_join: added one column (top_2224_sex_1) > rows only in tidylog::left_join(tidy.. 471 > rows only in invalid_sex_top_2224 ( 0) > matched rows 29 > ===== > rows total 500 left_join: added no columns > rows only in tidylog::left_join(tidy.. 500 > rows only in invalid_sex_c2_2224 ( 0) > matched rows 0 > ===== > rows total 500 Invalid sex that have at least one external sex, Entries: 500

Invalid sex that have at least one external sex, RUNs: 500

Code
invisible("No tiene info ext")
#filter(invalid_sex_ext_info, count_not_na==0) |> nrow() #22 # june 2025 22
invisible("No tiene info ext, sólo 2 obs")
#filter(invalid_sex_ext_info, count_not_na==0, total==2) |> nrow() #11 #11
invisible("No tiene info ext, empate")
#filter(invalid_sex_ext_info, count_not_na==0, c1_perc_hombre==c1_perc_mujer) |> nrow() #11, #11 lo mismo

invalid_sex_ext_info_post<-
invalid_sex_ext_info |> #perfect and accords
  mutate(decision = case_when(count_not_na>0 & perc_fem_ext>.5 & c1_perc_mujer>.5 ~ "6.a.a.1.female",
    count_not_na>0 & c1_perc_mujer<.5 & perc_fem_ext<.5~ "6.a.a.2.male",
    #less straight evidence, but more ext info to think is a woman
    count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext>.5 & total< count_not_na~ "6.a.b.1.female",
    #less straight evidence, but more ext info to think is a man
    count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext<.5 & total< count_not_na~ "6.a.b.2.male",
    #more int info to think is a woman, external is inconsistent 
    count_not_na>0 & c1_perc_mujer>.5 & perc_fem_ext==.5 & total> count_not_na~ "6.a.b.1.female",
    #more int info to think is a man, external is inconsistent 
    count_not_na>0 & c1_perc_mujer<.5 & perc_fem_ext==.5 & total> count_not_na~ "6.a.b.2.male",
    #less straight evidence, but less ext info to think is a woman
    count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext>.5 & total> count_not_na~ "6.a.b.1.female, but ask aux data",
    #less straight evidence, but less ext info to think is a man
    count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext<.5 & total> count_not_na~ "6.a.b.2.male, but ask aux data",
    #external data that says that is a female is more frequent
    count_not_na>0 & c1_perc_mujer<.5 & perc_fem_ext>=.5 & total< count_not_na~ "6.a.b.1.female",
    #external data that says that is a male is more frequent
    count_not_na>0 & c1_perc_mujer>=.5 & perc_fem_ext<.5 & total< count_not_na~ "6.a.b.2.male",
    #external data is inconsistent, but C1 contains more data and points out to woman
    count_not_na>0 & c1_perc_mujer>.5 & perc_fem_ext==.5 & total< count_not_na~ "6.a.b.1.female, but ask aux data",
    #external data is inconsistent, but C1 contains more data and points out to man
    count_not_na>0 & c1_perc_mujer<.5 & perc_fem_ext==.5 & total< count_not_na~ "6.a.b.2.male, but ask aux data",
    #external data, same frequency of C1 data; inconsistent probabilities
    count_not_na>0 & c1_perc_mujer>=.5 & perc_fem_ext<.5 & total== count_not_na~ "6.a.b.3.nondet",
    #external data, same frequency of C1 data; C1 is inconsistent, but external points out to woman
    count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext>.5 & total ==count_not_na~ "6.a.b.1.female, but ask aux data",
    #external data, same frequency of C1 data; C1 is inconsistent, but external points out to woman
    count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext<.5 & total ==count_not_na~ "6.a.b.2.male, but ask aux data",                      
    #external data, same frequency of C1 data; C1 is points out to woman, but external is inconsistent
    count_not_na>0 & c1_perc_mujer>.5 & perc_fem_ext==.5 & total ==count_not_na~ "6.a.b.1.female, but ask aux data",
    #external data, same frequency of C1 data; C1 is points out to man, but external is inconsistent
    count_not_na>0 & c1_perc_mujer<.5 & perc_fem_ext==.5 & total ==count_not_na~ "6.a.b.2.male, but ask aux data",                      
    #C1 data that says that is a female is more frequent
    count_not_na>0 & c1_perc_mujer>.5 & perc_fem_ext<.5 & total> count_not_na~ "6.a.b.4.female, but ask aux data",
    #C1 data that says that is a male is more frequent
    count_not_na>0 & c1_perc_mujer<.5 & perc_fem_ext>.5 & total> count_not_na~ "6.a.b.5.male, but ask aux data",
    #C1 data which was more frequent had a tie
    count_not_na>0 & perc_fem_ext>.5 & c1_perc_mujer==.5 & total> count_not_na~ "6.a.b.6.nondet",
    #external data, same percentage, remaining data
    count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext==.5~ "6.a.c.nondet",
    #more female records
    count_not_na==0 & total%% 2 == 0 & c1_perc_mujer>.5~ "6.b.a.1.female",
    #more male records
    count_not_na==0 & total%% 2 == 0 & c1_perc_mujer<.5~ "6.b.a.2.male",
    #ties in female values
    count_not_na==0 & total%% 2 == 0 & c1_perc_mujer==.5~ "6.b.a.3.nondet",
    #
    count_not_na==0 & total%% 2 != 0 & c1_perc_mujer>=.5~ "6.b.b.1.female",
    
    count_not_na==0 & total%% 2 != 0 & c1_perc_mujer<.5~ "6.b.b.1.male",
    T~"no sé"))

table(invalid_sex_ext_info_post$decision) |> data.frame() |> arrange(desc(Freq)) |> 
  knitr::kable("markdown", caption= "Preliminary solve inconsistencies in sex")
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
Preliminary solve inconsistencies in sex
Var1 Freq
6.a.a.1.female 139
6.a.b.1.female 125
6.a.c.nondet 45
6.a.a.2.male 43
6.a.b.2.male, but ask aux data 38
6.a.b.1.female, but ask aux data 35
6.a.b.2.male 27
6.a.b.3.nondet 17
6.b.a.3.nondet 11
6.b.b.1.male 10
6.a.b.5.male, but ask aux data 5
6.a.b.4.female, but ask aux data 3
6.b.a.2.male 1
no sé 1

For patients whose sex remained undetermined (n= 73) or ambiguous (n= 81) after the initial classification, we used pregnancy status and program type information to aid in the final determination. We also checked information of this kind in C1 to C6 databases (about pregnancy status). At last, we also used gender identity as a proxy.

To help determine a patient’s sex when it wasn’t clear after using pregnancy status, we created a tool that examines their primary diagnosis code (ICD-10) of patients with hospitalization records. This tool (infer_sex_icd10 function) checks the code against two specific lists: one containing codes strongly linked to female conditions (like pregnancy or female-specific cancers) and another with codes strongly linked to male conditions (like prostate issues or male-specific cancers).

If a patient’s code clearly matches a pattern on only one list, the tool suggests that sex (Female or Male). If the code doesn’t match any specific pattern on either list, or if it somehow matches patterns on both lists (which indicates a potential issue with the code or patterns), the tool flags the case as undetermined based on the diagnosis alone. This flagging signals that we need to rely on other information, such as external data sources or details like pregnancy status, to make the final sex determination.

Code
invalid_sex_ext_info_post_nondet<-invalid_sex_ext_info_post |> filter(grepl("nondet", decision))

invalid_sex_ext_info_post_ask<-invalid_sex_ext_info_post |> filter(grepl("ask", decision))

c1_6_sex_ext_data<-
group_by(subset(SISTRAT23_c1_2010_2022_df_prev1o, hash_key %in% c(invalid_sex_ext_info_post_nondet$hash_key, invalid_sex_ext_info_post_ask$hash_key)), hash_key)|> summarise(n_embarazada= sum(se_trata_de_una_mujer_embarazada=="si", na.rm=T), n_emb_egr= sum(ha_estado_embarazada_egreso=="si", na.rm=T), n_prog_mujeres= sum(grepl("mujeres",tipo_de_programa),na.rm=T),.groups="drop_last") |> 
  mutate(pregnancy_c2= ifelse(hash_key %in% c(subset(CONS_C2, a_setratadeunamujerembaraza=="si", "HASH_KEY"),subset(CONS_C2, haestadoembarazadaegreso=="si", "HASH_KEY")),1,0))|>   mutate(pregnancy_c3= ifelse(hash_key %in% c(subset(CONS_C3, haestadoembarazadaegreso=="si", "HASH_KEY"),subset(CONS_C3, setratadeunamujerembarazad=="si", "HASH_KEY")),1,0))|>
  mutate(pregnancy_c4= ifelse(hash_key %in% c(subset(CONS_C4, haestadoembarazadaegreso=="si", "HASH_KEY"),subset(CONS_C4, setratadeunamujerembarazada=="si", "HASH_KEY")),1,0))|>
  mutate(pregnancy_c5= ifelse(hash_key %in% c(subset(CONS_C5, embarazo=="si", "HASH_KEY"),subset(CONS_C5, haestadoembarazadaegreso=="si", "HASH_KEY")),1,0))|>
  mutate(pregnancy_c6= ifelse(hash_key %in% c(subset(CONS_C6, haestadoembarazadaegreso=="si", "HASH_KEY"),subset(CONS_C6, setratadeunamujerembarazada=="si", "HASH_KEY")),1,0))
#c2 to c6 didnt add info

invalid_sex_ext_info_post_nondet|>
left_join(c1_6_sex_ext_data, by="hash_key")|> 
  mutate(ext_data_woman= ifelse(n_embarazada>0|n_emb_egr>0|n_prog_mujeres>0,1,0))|> 
  mutate(ext_data_woman2= ifelse(pregnancy_c2>0|pregnancy_c3>0|pregnancy_c4>0|pregnancy_c5>0|pregnancy_c6>0,1,0))|> 
    (\(df) {
    cat(paste0("Non-determined sex with pregnancy status: ", filter(df, ext_data_woman==1) |> nrow()))
      filter(df, ext_data_woman==1) |> pull(hash_key) ->> hashs_invalid_sex_nondet_pregnant
  })()
Non-determined sex with pregnancy status: 16
Code
invalid_sex_ext_info_post_ask|> 
left_join(c1_6_sex_ext_data, by="hash_key")|> 
  mutate(decision_woman= grepl("female",decision), ext_data_woman= ifelse(n_embarazada>0|n_emb_egr>0|n_prog_mujeres>0,1,0))|>
  mutate(ext_data_woman2= ifelse(pregnancy_c2>0|pregnancy_c3>0|pregnancy_c4>0|pregnancy_c5>0|pregnancy_c6>0,1,0))|> 
  #janitor::tabyl(decision_woman, ext_data_woman)|> 
    (\(df) {
    cat(paste0("Suggested as being female and with pregnancy status: ", filter(df, decision_woman==1 & ext_data_woman==1) |> nrow()))
    filter(df, decision_woman==1 & ext_data_woman==1) |> pull(hash_key) ->> hashs_invalid_sex_woman_ask_pregnant
    filter(df, decision_woman==0 & ext_data_woman==1) |> pull(hash_key) ->> hashs_invalid_sex_man_ask_pregnant
  })()
Suggested as being female and with pregnancy status: 20
Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
infer_sex_icd10 <- function(icd_codes) {
  # Enhanced female-specific ICD-10 patterns
  female_patterns <- c(
    "^O",              # Pregnancy/childbirth (O00-O99)
    "^C5[1-8]",        # Female genital cancers (C51-C58)
    "^D0[6-7]",        # CIS female genital (D06-D07)
    "^D2[4-8]",        # Benign female neoplasms (D24-D28)
    "^N7[0-7]",        # PID (N70-N77)
    "^N8[0-9]|^N9[0-8]", # Non-inflammatory disorders (N80-N98)
    "^Q5[0-2]",        # Congenital female anomalies (Q50-Q52)
    "^Z12\\.4",        # Female cancer screening
    "^Z3[0-9]"         # Reproductive health encounters (Z30-Z39)
  )
  
  # Enhanced male-specific ICD-10 patterns
  male_patterns <- c(
    "^N[4-5][0-9]",    # Male genital disorders (N40-N51)
    "^C6[0-3]",        # Male genital cancers (C60-C63)
    "^D29",            # Benign male neoplasms
    "^Q5[3-5]",        # Congenital male anomalies (Q53-Q55)
    "^Z12\\.5",        # Prostate screening
    "^Z41\\.2",        # Vasectomy
    "^Z90\\.7"         # Acquired absence of male genital
  )
  
  # Check matches
  is_female <- map_lgl(icd_codes, ~ any(stringr::str_detect(.x, female_patterns)))
  is_male <- map_lgl(icd_codes, ~ any(stringr::str_detect(.x, male_patterns)))
  
  case_when(
    is_female & !is_male ~ "Female",
    is_male & !is_female ~ "Male",
    is_female & is_male ~ "Conflict",
    TRUE ~ "nondet"
  )
}

cat("Classification based of ICD-10 diagnoses in hospitalizations")
Classification based of ICD-10 diagnoses in hospitalizations
Code
HOSP_filter_df|> 
  mutate(sex= infer_sex_icd10(diag1))|> 
  janitor::tabyl(sex)
    sex      n     percent
 Female  52185 0.246163788
   Male   1734 0.008179515
 nondet 158074 0.745656696
Code
# sexo Female Male nondet
#    1     99 1734 109411
#    2  52086    0  48653
#    9      0    0     10

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
HOSP_filter_df|> 
    filter(run %in% setdiff(invalid_sex_ext_info_post_ask$hash_key, hashs_invalid_sex_woman_ask_pregnant))|>
    mutate(sex= infer_sex_icd10(diag1))|> 
    (\(df) {
        print(message(paste0("Hospital, suggested sex (w/o pregnancy status), Entries: ", nrow(df))))
        print(message(paste0("Hospital, suggested sex (w/o pregnancy status), RUNs: ", distinct(df, run) |> nrow())))
        df
    })()|>
    select(run, sex)|>
    summarise(female_icd10= sum(sex=="Female", na.rm=T), male_icd10= sum(sex=="Male", na.rm=T), .by= run,.groups="drop_last")|>
    filter(female_icd10>0|male_icd10>0)|> 
      (\(df) {
    cat(paste0("Cases with a suggested sex requiring external confirmation, lacking pregnancy information, but having available hospitalization records with ICD-10 diagnoses: ", nrow(df),"\n"))
    filter(df, female_icd10>0) |> pull(run) ->> hashs_invalid_sex_woman_ask_non_pregnant_but_icd10
    filter(df, male_icd10>0) |> pull(run) ->> hashs_invalid_sex_man_ask_not_pregnant_but_icd10
  })()

Hospital, suggested sex (w/o pregnancy status), Entries: 132

NULL

Hospital, suggested sex (w/o pregnancy status), RUNs: 41

NULL
Cases with a suggested sex requiring external confirmation, lacking pregnancy information, but having available hospitalization records with ICD-10 diagnoses: 12
Code
HOSP_filter_df|> 
    filter(run %in% setdiff(invalid_sex_ext_info_post_nondet$hash_key, hashs_invalid_sex_nondet_pregnant))|>
    mutate(sex= infer_sex_icd10(diag1))|> 
    (\(df) {
        print(message(paste0("Hospital, non-determined sex (w/o pregnancy status), Entries: ", nrow(df))))
        print(message(paste0("Hospital, non-determined sex (w/o pregnancy status), RUNs: ", distinct(df, run) |> nrow())))
        df
    })()|>
    select(run, sex)|>
    summarise(female_icd10= sum(sex=="Female", na.rm=T), male_icd10= sum(sex=="Male", na.rm=T), .by= run,.groups="drop_last")|>
    filter(female_icd10>0|male_icd10>0)|> 
      (\(df) {
    cat(paste0("Cases with a non-determined sex, lacking pregnancy information, but having available hospitalization records with ICD-10 diagnoses: ", nrow(df),"\n"))
    filter(df, female_icd10>0) |> pull(run) ->> hashs_invalid_sex_woman_nondet_non_pregnant_but_icd10
    filter(df, male_icd10>0) |> pull(run) ->> hashs_invalid_sex_man_nondet_not_pregnant_but_icd10
  })()

Hospital, non-determined sex (w/o pregnancy status), Entries: 75

NULL

Hospital, non-determined sex (w/o pregnancy status), RUNs: 25

NULL
Cases with a non-determined sex, lacking pregnancy information, but having available hospitalization records with ICD-10 diagnoses: 5
Code
HOSP_filter_df|> 
    filter(run %in% hashs_invalid_sex_man_ask_pregnant)|>
    mutate(sex= infer_sex_icd10(diag1))|> 
    (\(df) {
        print(message(paste0("Hospital, suggested sex male (w/ pregnancy status), Entries: ", nrow(df))))
        print(message(paste0("Hospital, suggested sex male (w/ pregnancy status), RUNs: ", distinct(df, run) |> nrow())))
        df
    })()|>
    #distinct(run, sex)|>
    select(run, sex)|>
    summarise(female_icd10= sum(sex=="Female", na.rm=T), male_icd10= sum(sex=="Male", na.rm=T), .by= run,.groups="drop_last")|>
    filter(female_icd10>0|male_icd10>0)|> 
      (\(df) {
    cat(paste0("Cases with a suggested Male value, with pregnancy information, but having available hospitalization records with ICD-10 diagnoses: ", nrow(df),"\n"))
    filter(df, female_icd10>0) |> pull(run) ->> hashs_invalid_sex_man_woman_nondet_non_pregnant_but_icd10
    filter(df, male_icd10>0) |> pull(run) ->> hashs_invalid_sex_man_man_nondet_not_pregnant_but_icd10
  })()

Hospital, suggested sex male (w/ pregnancy status), Entries: 6

NULL

Hospital, suggested sex male (w/ pregnancy status), RUNs: 2

NULL
Cases with a suggested Male value, with pregnancy information, but having available hospitalization records with ICD-10 diagnoses: 0
Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Debería añadir el sexo aquí")

#invalid_sex_c1_2324_idgen

invalid_sex_ext_info_post_hosp<-
invalid_sex_ext_info_post|> 
  mutate(decision_post= case_when(hash_key %in% 
                                    hashs_invalid_sex_woman_ask_non_pregnant_but_icd10~ "6.c.1.4.a.ask_confirmed_icd10_female",
                                  hash_key %in% hashs_invalid_sex_man_ask_not_pregnant_but_icd10~ "6.c.1.4.b.ask_confirmed_icd10_male",
                                  hash_key %in% hashs_invalid_sex_woman_nondet_non_pregnant_but_icd10~ "6.c.1.5.a.nondet_confirmed_icd10_female",
                                  hash_key %in% hashs_invalid_sex_man_nondet_not_pregnant_but_icd10~ "6.c.1.5.a.nondet_confirmed_icd10_male",
                                  hash_key %in% hashs_invalid_sex_man_woman_nondet_non_pregnant_but_icd10~ "6.c.1.6.a.nondet_inconsistent_icd10_female",
                                  hash_key %in% hashs_invalid_sex_man_man_nondet_not_pregnant_but_icd10~ "6.c.1.5.6.nondet_inconsistent_icd10_male",                             
                                  hash_key %in% hashs_invalid_sex_woman_ask_pregnant~ "6.c.1.a.ask_confirmed_female",
                                  hash_key %in% hashs_invalid_sex_man_ask_pregnant~ "6.c.1.b.ask_confirmed_male",
                                  hash_key %in% hashs_invalid_sex_nondet_pregnant~ "6.c.2.nondet_female",
                                  hash_key %in% hashs_invalid_sex_nondet_pregnant~ "6.c.3.ask_confirmed_female",T~ decision)) |> select(hash_key, decision_post)#decision

table(invalid_sex_ext_info_post_hosp$decision_post)|> 
  data.frame()|> 
  arrange(desc(Freq))|> 
  knitr::kable("markdown", caption="Decision after using hospitalization data")
Decision after using hospitalization data
Var1 Freq
6.a.a.1.female 139
6.a.b.1.female 125
6.a.a.2.male 43
6.a.b.2.male, but ask aux data 32
6.a.b.2.male 27
6.a.c.nondet 27
6.c.1.a.ask_confirmed_female 20
6.c.2.nondet_female 16
6.a.b.3.nondet 14
6.b.a.3.nondet 11
6.c.1.4.a.ask_confirmed_icd10_female 11
6.b.b.1.male 10
6.a.b.1.female, but ask aux data 9
6.c.1.b.ask_confirmed_male 6
6.c.1.5.a.nondet_confirmed_icd10_female 5
6.a.b.5.male, but ask aux data 2
6.b.a.2.male 1
6.c.1.4.b.ask_confirmed_icd10_male 1
no sé 1

Undetermined sex classifications were retained, recognizing that for these records, inferring sex from external data might be unreliable. This category could include cases where inference is difficult, as well as potential instances of gender transition.

Code
c1_inconsistent_sex_genid_10_21<- 
SISTRAT23_c1_2010_2022_df_prev1o|> 
  filter(hash_key %in% invalid_sex_ext_info_post_hosp$hash_key) |> 
  select(hash_key, identidad_de_genero) |> 
  group_by(hash_key)|> 
  summarise(prop_fem_genid_1021=sum(grepl("fem",identidad_de_genero),na.rm=T)/sum(!is.na(identidad_de_genero), na.rm=T))|> 
  ungroup()


invalid_sex_ext_info_post_hosp_c1_10_24<- 
invalid_sex_ext_info_post_hosp|> 
  left_join(c1_inconsistent_sex_genid_10_21,by="hash_key")|>
  left_join(invalid_sex_c1_2324_idgen,by="hash_key")


SISTRAT23_c1_2010_2022_df_prev1p<-
  SISTRAT23_c1_2010_2022_df_prev1o|> 
    (\(df) {
        cat(paste0("6.Number of cases before resolving inconsistencies in sex: ", formatC(nrow(df), big.mark=",")),"\n")
        cat(paste0("6.Number of patients before resolving inconsistencies in sex: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
        df
    })()|> 
  left_join(invalid_sex_ext_info_post_hosp_c1_10_24, multiple="first")|> 
  mutate(sex_rec= case_when(grepl("det$",decision_post)~ sexo, grepl("female", decision_post)~ "mujer", grepl("male", decision_post)~ "hombre", T~ sexo))|>
  #filter(is.na(sex_rec), !is.na(c1_2324_genid_1))|>
  #select(hash_key, sexo, decision_post,  prop_fem_genid_1021, c1_2324_genid_1, c1_2324_genid_2)|>
  mutate(sex_rec= case_when(is.na(sex_rec) & (prop_fem_genid_1021>.5|grepl("fem", c1_2324_genid_1))~ "mujer", T~ sex_rec))|>
  mutate(sex_rec= case_when(sex_rec=="mujer"~ "female", sex_rec=="hombre"~ "male", T~ NA_character_))|>
  #janitor::tabyl(sex_rec, sexo)
  mutate(OBS= case_when(!is.na(decision_post)~ paste0(as.character(OBS),";",decision_post), T~ OBS))|> 
  select(-decision_post)|> 
    (\(df) {
        cat(paste0("6. After after resolving inconsistencies in sex, obs.: ", formatC(nrow(df), big.mark=",")),"\n")
        cat(paste0("6. After after resolving inconsistencies in sex, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
        if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1o))stop("Error: Added treatment episodes in the process")
        df
    })() |> 
  select(-prop_fem_genid_1021, -c1_2324_genid_1, -c1_2324_genid_2)
6.Number of cases before resolving inconsistencies in sex: 150,046 
6.Number of patients before resolving inconsistencies in sex: 106,283 
6. After after resolving inconsistencies in sex, obs.: 150,046 
6. After after resolving inconsistencies in sex, RUNs: 106,283 

The database resulting from these changes was named SISTRAT23_c1_2010_2022_df_prev1p, and the new variable containing the recoded sex information is called sex_rec.

2.2. Nationality

  • Nationality (nacionalidad) (n= 119). We created a column called nationallity_cons showing what those inconsistent nationalities are for each affected patient.
Code
invisible("no tiene perdidos, la mayoría son de chile. Por tanto, si es distinto a Chile, reemplazarlo")

invalid_nationality_by_patient<-
SISTRAT23_c1_2010_2022_df_prev1p|>  group_by(hash_key)|> summarise(nacionalidades_por_hash = n_distinct(nacionalidad), distinto_chile = sum(nacionalidad!="chile", na.rm=T), tot_obs = n())|> ungroup()|> mutate(perc_extranjero = distinto_chile/tot_obs)|> filter(nacionalidades_por_hash>1)|> pull(hash_key)

multiple_nationalities<- 
SISTRAT23_c1_2010_2022_df_prev1p|> select(hash_key, nacionalidad)|>  filter(hash_key %in% invalid_nationality_by_patient)|> 
  summarise(nacionalidad_distinct = paste(sort(unique(nacionalidad)), collapse = "; "), .by="hash_key")|>
  #mutate(rnnac= row_number(),.by="hash_key")|> pivot_wider(names_from="rnnac", values_from="nacionalidad")
  ungroup()
Code
invisible("This database is useless. We cant obtain information because there is no 1:1 linkage")

hosp_un_inv_2<-
rio::import(paste0(gsub("/cons","/data/20231205_original_data",getwd()),"/EH_2010_2022_Pasantes_v2_encrip.csv"))

#hosp_un_inv_2[,c("RUN", "ESTAB_HOMO", "FECHA_INGRESO_FMT_DEIS", "FECHA_EGRESO_FMT_DEIS", "SEXO", "EDAD_ANOS", "DIAG1", "DIAG2", "COND_EGR")]
hosp_un_inv_2_df<- hosp_un_inv_2 %>%
  mutate(DIAG2= ifelse(nchar(DIAG2)<2, NA_character_, DIAG2)) %>%
  mutate(
    KEY = paste(ESTAB_HOMO, FECHA_INGRESO_FMT_DEIS, FECHA_EGRESO_FMT_DEIS,
                SEXO, EDAD_ANOS, DIAG1, DIAG2, COND_EGR, sep = "|")
  )

#HOSP_filter_df[, c("run", "estab_homo", "fecha_ingreso", "fecha_egreso", "sexo", "edad_anos", "diag1", "diag2", "cond_egr")]
HOSP_filter_df<- HOSP_filter_df %>%
  mutate(
    KEY = paste(estab_homo, fecha_ingreso, fecha_egreso,
                sexo, edad_anos, diag1, diag2, cond_egr, sep = "|")
  )

HOSP_filter_df_join_KEY_more_one<-
  HOSP_filter_df|> 
  inner_join(hosp_un_inv_2_df, by="KEY")|> 
  group_by(KEY)|> 
  count()|> 
  ungroup()|> 
  filter(n>1)


HOSP_filter_df_join_KEY_only_one<-
  HOSP_filter_df|> 
  inner_join(hosp_un_inv_2_df, by="KEY")|> 
  group_by(KEY)|> 
  mutate(n=n())|> 
  ungroup()|> 
  filter(n==1)

# HOSP_filter_df|> 
# inner_join(hosp_un_inv_2_df, by="KEY")|> 
#   filter(GLOSA_PAIS_ORIGEN!="")|> 
#   #select(run, GLOSA_PAIS_ORIGEN)|> 
#   distinct(run, GLOSA_PAIS_ORIGEN)|> 
#   ungroup()|>
#   group_by(run)|>
#   mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
#   pivot_wider(names_from = id, values_from = GLOSA_PAIS_ORIGEN, 
#                          names_prefix = "hosp_nat_")

2.3. Starting substance

Starting Substance (first_sub_used) (n= 13,881). For users that had only two treatments but a different starting substance, or in cases or users that had ties within most recent database or within the most recent value, we added a second and a third variable called sus_ini_2 and sus_ini_3 that contains a second starting substance. We also made sus_ini_mvv for starting substances of the most vulnerable value reported (Paste Base > Cocaine hydrochloride > Alcohol > Marijuana > Other).

Code
invalid_start_subs_hash_key<- 
  SISTRAT23_c1_2010_2022_df_prev1p|> summarise(sus_ini_por_hash = n_distinct(first_sub_used),.by=hash_key, .groups="drop_last")|> filter(sus_ini_por_hash>1)|> pull(hash_key)

cat("Number of distinct starting substances\n")
SISTRAT23_c1_2010_2022_df_prev1p|> summarise(sus_ini_por_hash = n_distinct(first_sub_used),.by=hash_key, .groups="drop_last")|> filter(sus_ini_por_hash>1)|> pull(sus_ini_por_hash)|> summary()
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
invisible("======================================================")
invalid_start_subs_c2<-  
    CONS_C2 |>
    filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
    (\(df) {
        print(message(paste0("C2, Entries: ", nrow(df))))
        print(message(paste0("C2, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    distinct(HASH_KEY, sustancia_inicial)|>
    rename("start_sub"=2)|> 
    tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
    ungroup()|> 
    group_by(HASH_KEY)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = start_sub, 
                names_prefix = "c2_susini_")

C2, Entries: 175

C2, RUNs: 81

Code
invisible("======================================================")
invalid_start_subs_c3<-  
    CONS_C3 |>
    filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
    (\(df) {
        print(message(paste0("C2, Entries: ", nrow(df))))
        print(message(paste0("C2, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    distinct(HASH_KEY, sustanciade_inicio)|>
    rename("start_sub"=2)|> 
    tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
    ungroup()|>  
    group_by(HASH_KEY)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = start_sub, 
                names_prefix = "c3_susini_")

C2, Entries: 175

C2, RUNs: 137

Code
invisible("======================================================")
invalid_start_subs_c4<-  
    CONS_C4 |>
    filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
    (\(df) {
        print(message(paste0("C2, Entries: ", nrow(df))))
        print(message(paste0("C2, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    distinct(HASH_KEY, sustanciadeinicio)|>
    rename("start_sub"=2)|> 
    tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
    ungroup()|>  
    group_by(HASH_KEY)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = start_sub, 
                names_prefix = "c4_susini_")

C2, Entries: 97

C2, RUNs: 79

Code
invisible("======================================================")
invalid_start_subs_c5<-  
    CONS_C5 |>
    filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
    (\(df) {
        print(message(paste0("C2, Entries: ", nrow(df))))
        print(message(paste0("C2, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    distinct(HASH_KEY, sustancia_inicial)|>
    rename("start_sub"=2)|> 
    tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
    ungroup()|>  
    group_by(HASH_KEY)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = start_sub, 
                names_prefix = "c5_susini_")

C2, Entries: 41

C2, RUNs: 32

Code
invisible("======================================================")
invalid_start_subs_c6<-  
CONS_C6|>
    filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
    (\(df) {
        print(message(paste0("C6, Entries: ", nrow(df))))
        print(message(paste0("C6, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    distinct(HASH_KEY, sustanciadeinicio)|>
    rename("start_sub"=2)|> 
    tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
    ungroup()|>  
    group_by(HASH_KEY)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = start_sub, 
                           names_prefix = "c6_susini_")

C6, Entries: 42

C6, RUNs: 38

Code
invisible("======================================================")
invalid_start_subs_c2_2324<-  
    c2_2324 |>
    filter(hashkey %in% invalid_start_subs_hash_key)|>
    (\(df) {
        print(message(paste0("C2,23-24, Entries: ", nrow(df))))
        print(message(paste0("C2,23-24, RUNs: ", distinct(df, hashkey)|> nrow())))
        df
    })()|>  
    mutate(sustancia_inicial= tolower(sustancia_inicial))|> 
    distinct(hashkey, sustancia_inicial)|>
    rename("start_sub"=2)|> 
    tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
    ungroup()|> 
    group_by(hashkey)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = start_sub, 
                names_prefix = "c2_susini2324_")

C2,23-24, Entries: 3

C2,23-24, RUNs: 2

Code
invisible("======================================================")
invalid_start_subs_c1_2324<-  
    SISTRAT23_c1_2023_2024_df2|>
    filter(hash_key %in% invalid_start_subs_hash_key)|>
    (\(df) {
        print(message(paste0("C1,23-24, Entries: ", nrow(df))))
        print(message(paste0("C1,23-24, RUNs: ", distinct(df, hash_key)|> nrow())))
        df
    })()|>  
    distinct(hash_key, sustancia_de_inicio)|>
    rename("start_sub"=2)|> 
    tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
    ungroup()|> 
    group_by(hash_key)|>
    mutate(id = as.character(dplyr::row_number()))|>  # Convertir `id` a carácter
    pivot_wider(names_from = id, values_from = start_sub, 
                names_prefix = "c1_2224_susini2324_")

C1,23-24, Entries: 3243

C1,23-24, RUNs: 2127

Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
invisible("Generar jerarquías de sustancias de inicio")
#attr(table(SISTRAT23_c1_2010_2022_df_prev1p$first_sub_used),"names") |> dput()

substances<- c("alcohol", "amphetamine-type stimulants", "cocaine paste",
"cocaine powder", "hallucinogens", "inhalants", "marijuana",
"opioids", "others", "tranquilizers/hypnotics")

cat("Make counts by RUN and top substances in case of more than one initial substance\n")

invalid_start_subs_ext_info<-
SISTRAT23_c1_2010_2022_df_prev1p|> #00068c7eed2a6c21c8750f250b601cbfe29262728726655a0958c49ce64667d0 $ first_sub_used <chr> NA, "others", "alcohol"
  tidytable::filter(hash_key %in% invalid_start_subs_hash_key)|>
  select(hash_key, first_sub_used)|> 
  group_by(hash_key)|>
    summarise(
    alcohol                     = sum(first_sub_used == "alcohol", na.rm=T),
    amphetamine_type_stimulants = sum(first_sub_used == "amphetamine-type stimulants", na.rm=T),
    cocaine_paste               = sum(first_sub_used == "cocaine paste", na.rm=T),
    cocaine_powder              = sum(first_sub_used == "cocaine powder", na.rm=T),
    hallucinogens               = sum(first_sub_used == "hallucinogens", na.rm=T),
    inhalants                   = sum(first_sub_used == "inhalants", na.rm=T),
    marijuana                   = sum(first_sub_used == "marijuana", na.rm=T),
    opioids                     = sum(first_sub_used == "opioids", na.rm=T),
    others                      = sum(first_sub_used == "others", na.rm=T),
    tranquilizers_hypnotics     = sum(first_sub_used == "tranquilizers/hypnotics", na.rm=T),
    total                       = n(),       # total records per hash_key
    .groups = "drop"
  )|> 
  tidylog::left_join(invalid_start_subs_c2, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidylog::left_join(invalid_start_subs_c3, by=c("hash_key"="HASH_KEY"), multiple="first")|> 
  tidylog::left_join(invalid_start_subs_c4, by=c("hash_key"="HASH_KEY"), multiple="first")|> 
  tidylog::left_join(invalid_start_subs_c5, by=c("hash_key"="HASH_KEY"), multiple="first")|> 
  tidylog::left_join(invalid_start_subs_c6, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidylog::left_join(invalid_start_subs_c1_2324, by=c("hash_key"="hash_key"), multiple="first")|>
  tidylog::left_join(invalid_start_subs_c2_2324, by=c("hash_key"="hashkey"), multiple="first")|>
  (\(df) {
    # 1) coerce to a plain data.frame so base R subsetting works
    df2 <- as.data.frame(df)

    # 2) find and lowercase your susini columns
    susini_cols <- grep("^c[2-6]_susini_\\d+$", names(df2), value = TRUE)
    df2[susini_cols] <- lapply(df2[susini_cols], tolower)

    # 3) for each substance, add a new “n_<substance>” column
    for (sub in substances) {
      safe_nm <- paste0("n_", make.names(sub))
      df2[[safe_nm]] <- rowSums(df2[susini_cols] == sub, na.rm = TRUE)
    }
    
    # 4) now build the full list of count-columns to rank
    base_counts <- c(
      "alcohol",
      "amphetamine_type_stimulants",
      "cocaine_paste",
      "cocaine_powder",
      "hallucinogens",
      "inhalants",
      "marijuana",
      "opioids",
      "others",
      "tranquilizers_hypnotics"
    )
    row_counts  <- paste0("n_", make.names(substances))
    safe_names  <- c(base_counts, row_counts)

    # 5) mapping back to the human labels
    all_labels <- c(substances, substances)
    names(all_labels) <- safe_names

    # 6) compute top 3 over these combined counts
    top3 <- t(apply(df2[safe_names], 1, function(x) {
      # only positive counts
      x_pos <- x[x > 0]
      if (length(x_pos) == 0) return(rep(NA_character_, 3))
      ord    <- sort(x_pos, decreasing = TRUE)
      nm3    <- names(ord)[seq_len(min(3, length(ord)))]
      # map back & pad with NAs
      c(all_labels[nm3], rep(NA_character_, 3 - length(nm3)))
    }))

    colnames(top3) <- paste0("sus_ini_", 1:3, "_mod")
    df2 <- cbind(df2, as.data.frame(top3, stringsAsFactors = FALSE))

    df2
  })()

left_join: added 3 columns (c2_susini_1, c2_susini_2, c2_susini_3)

       > rows only in summarise(group_by(sele..  13,800
       > rows only in invalid_start_subs_c2     (     0)
       > matched rows                                81
       >                                        ========
       > rows total                              13,881

left_join: added 3 columns (c3_susini_1, c3_susini_2, c3_susini_3) > rows only in tidylog::left_join(summ.. 13,744 > rows only in invalid_start_subs_c3 ( 0) > matched rows 137 > ======== > rows total 13,881 left_join: added 2 columns (c4_susini_1, c4_susini_2) > rows only in tidylog::left_join(tidy.. 13,802 > rows only in invalid_start_subs_c4 ( 0) > matched rows 79 > ======== > rows total 13,881 left_join: added 2 columns (c5_susini_1, c5_susini_2) > rows only in tidylog::left_join(tidy.. 13,849 > rows only in invalid_start_subs_c5 ( 0) > matched rows 32 > ======== > rows total 13,881 left_join: added 2 columns (c6_susini_1, c6_susini_2) > rows only in tidylog::left_join(tidy.. 13,843 > rows only in invalid_start_subs_c6 ( 0) > matched rows 38 > ======== > rows total 13,881 left_join: added 3 columns (c1_2224_susini2324_1, c1_2224_susini2324_2, c1_2224_susini2324_3) > rows only in tidylog::left_join(tidy.. 11,754 > rows only in invalid_start_subs_c1_2.. ( 0) > matched rows 2,127 > ======== > rows total 13,881 left_join: added one column (c2_susini2324_1) > rows only in tidylog::left_join(tidy.. 13,879 > rows only in invalid_start_subs_c2_2.. ( 0) > matched rows 2 > ======== > rows total 13,881

Number of distinct starting substances
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.000   2.000   2.000   2.144   2.000   5.000 
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
Make counts by RUN and top substances in case of more than one initial substance

Now we generated the criteria of the most vulnerable variable.

Code
invisible("buscar criterio de mvv")

SISTRAT23_c1_2010_2022_df_prev1q<-
  SISTRAT23_c1_2010_2022_df_prev1p|> 
    (\(df) {
        cat(paste0("7.Number of cases before adding inconsistencies in nationallity & starting substance: ", formatC(nrow(df), big.mark=",")),"\n")
        cat(paste0("7.Number of patients before adding inconsistencies in nationallity & starting substance: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
        df
    })()|> 
  left_join(multiple_nationalities, multiple="first")|> 
  mutate(nationallity_cons= case_when(!is.na(nacionalidad_distinct)~ nacionalidad_distinct, T~ nacionalidad))|>
  select(-nacionalidad_distinct)|> 
    (\(df) {
        cat(paste0("7.Number of cases before adding inconsistencies in starting substance: ", formatC(nrow(df), big.mark=",")),"\n")
        cat(paste0("7.Number of patients before adding inconsistencies in starting substance: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
        df
    })()|>   
  left_join(invalid_start_subs_ext_info[, c("hash_key",paste0("sus_ini_",1:3,"_mod"))], multiple="first")|> 
  mutate(sus_ini_mod_pb=case_when(grepl("past",first_sub_used) & is.na(sus_ini_1_mod)~"cocaine paste",
    grepl("past",sus_ini_3_mod)~"cocaine paste",
    grepl("past",sus_ini_2_mod)~"cocaine paste",
    grepl("past",sus_ini_1_mod)~"cocaine paste",
    TRUE~NA_character_))|>
  mutate(sus_ini_mod_oh=case_when(grepl("alcohol",first_sub_used) & is.na(sus_ini_1_mod)~"alcohol", grepl("alcohol",sus_ini_3_mod)~"alcohol",
    grepl("alcohol",sus_ini_2_mod)~"alcohol",
    grepl("alcohol",sus_ini_1_mod)~"alcohol",
    TRUE~NA_character_))|>
  mutate(sus_ini_mod_coc=case_when(grepl("powder",first_sub_used) & is.na(sus_ini_1_mod)~"cocaine powder",
    grepl("powder",sus_ini_3_mod)~"cocaine powder",
    grepl("powder",sus_ini_2_mod)~"cocaine powder",
    grepl("powder",sus_ini_1_mod)~"cocaine powder",
    TRUE~NA_character_))|>
  mutate(sus_ini_mod_mar=case_when(grepl("marij",first_sub_used) & is.na(sus_ini_1_mod)~"marijuana",
    grepl("marij",sus_ini_3_mod)~"marijuana",
    grepl("marij",sus_ini_2_mod)~"marijuana",
    grepl("marij",sus_ini_1_mod)~"marijuana",
    TRUE~NA_character_))|>
  mutate(sus_ini_mod_otr=case_when(!grepl("alcohol|past|powder|marij",first_sub_used) & !is.na(first_sub_used) & is.na(sus_ini_1_mod)~ "others", 
   !grepl("alcohol|past|powder|marij",sus_ini_3_mod) & !is.na(sus_ini_3_mod)~"others",
   !grepl("alcohol|past|powder|marij",sus_ini_2_mod) & !is.na(sus_ini_2_mod)~"others",
   !grepl("alcohol|past|powder|marij",sus_ini_1_mod) & !is.na(sus_ini_1_mod)~"others",
   TRUE~NA_character_))|>
  mutate(sus_ini_mod_mvv=case_when(grepl("past", sus_ini_mod_pb)~"cocaine paste",
    grepl("powder",sus_ini_mod_coc)~"cocaine powder",
    grepl("marijuana",sus_ini_mod_mar)~"marijuana",
    grepl("alcohol",sus_ini_mod_oh)~"alcohol",
    grepl("oth",sus_ini_mod_otr)~"others",
    TRUE~NA_character_))|>
  #janitor::tabyl(sus_ini_mod_mvv)
  mutate(sus_ini_mod_mvv=factor(sus_ini_mod_mvv,labels=c("alcohol",
                                                                "cocaine powder",
                                                                "marijuana",
                                                                "others",
                                                                "cocaine paste")))|> 
    select(-any_of(c("sus_ini_mod_pb", "sus_ini_mod_oh", "sus_ini_mod_coc", "sus_ini_mod_mar", "sus_ini_mod_otr")))|>  
    (\(df) {
        cat(paste0("7. After after resolving inconsistencies in nationallity & starting substance, obs.: ", formatC(nrow(df), big.mark=",")),"\n")
        cat(paste0("7. After after resolving inconsistencies in nationallity & starting substance, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
        if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1p))stop("Error: Added treatment episodes in the process")
        df
    })() |> 
  select(-sus_ini_1_mod, -sus_ini_2_mod, -sus_ini_3_mod)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
#starting substance
#Segunda Sustancia de Inicio(Sólo más frecuentes) Second Starting Substance
#Tercera Sustancia de Inicio(Sólo más frecuentes) Third Starting Substance
#Sustancia de Inicio (Sólo más frecuentes)    Starting Substance (Only more frequent)
7.Number of cases before adding inconsistencies in nationallity & starting substance: 150,046 
7.Number of patients before adding inconsistencies in nationallity & starting substance: 106,283 
7.Number of cases before adding inconsistencies in starting substance: 150,046 
7.Number of patients before adding inconsistencies in starting substance: 106,283 
7. After after resolving inconsistencies in nationallity & starting substance, obs.: 150,046 
7. After after resolving inconsistencies in nationallity & starting substance, RUNs: 106,283 

Eliminate to comply with ethic concerns

Code
#Primero, se eliminará completamente la columna "Código Identificación
#de SENDA", ya que contiene información potencialmente vulnerable basada en atributos
#personales. En segundo lugar, dado que la identificación del centro de tratamiento podría
#facilitar la reidentificación indirecta de algunos pacientes, esta variable será sometida a 
#un proceso de encriptación irreversible utilizando el paquete estadístico “sodium” (v.1.4.0; Ooms, J., 2024).

To close the project, we erase polars objects.

Code
rm(list = ls()[grepl("_pl$", ls())])

Session info

Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
message(paste0("R library: ", Sys.getenv("R_LIBS_USER")))

R library: G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32

Code
message(paste0("Date: ",withr::with_locale(new = c('LC_TIME' = 'C'), code =Sys.time())))

Date: 2025-06-06 16:25:34.618975

Code
message(paste0("Editor context: ", path))

Editor context: E:/Mi unidad/Alvacast/SISTRAT 2023/cons

Code
cat("quarto version: "); quarto::quarto_version()
quarto version: 
[1] '1.7.29'
Code
sesion_info <- devtools::session_info()

Warning in system2(“quarto”, “-V”, stdout = TRUE, env = paste0(“TMPDIR=”, : el comando ejecutado ‘“quarto” TMPDIR=C:/Users/andre/AppData/Local/Temp/Rtmpa25KUs/file13d385ce5280e -V’ tiene el estatus 1

Code
dplyr::select(
  tibble::as_tibble(sesion_info$packages),
  c(package, loadedversion, source)
) %>% 
  DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
              caption = htmltools::tags$caption(
        style = 'caption-side: top; text-align: left;',
        '', htmltools::em('R packages')),
      options=list(
initComplete = htmlwidgets::JS(
        "function(settings, json) {",
        "$(this.api().tables().body()).css({
            'font-family': 'Helvetica Neue',
            'font-size': '70%', 
            'code-inline-font-size': '15%', 
            'white-space': 'nowrap',
            'line-height': '0.75em',
            'min-height': '0.5em'
            });",
        "}")))
Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
#| class-output: center-table

reticulate::py_list_packages() %>% 
  DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
              caption = htmltools::tags$caption(
        style = 'caption-side: top; text-align: left;',
        '', htmltools::em('Python packages')),
      options=list(
initComplete = htmlwidgets::JS(
        "function(settings, json) {",
        "$(this.api().tables().body()).css({
            'font-family': 'Helvetica Neue',
            'font-size': '70%', 
            'code-inline-font-size': '15%', 
            'white-space': 'nowrap',
            'line-height': '0.75em',
            'min-height': '0.5em'
            });",
        "}"))) 

Error in path.expand(path): argumento ‘path’ inválido

Save

Code
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}

paste0(getwd(),"/cons")
file.path(paste0(wdpath,"data/20241015_out"))
file.path(paste0(envpath,"data/20241015_out"))

# Save
rdata_path <- file.path(wdpath, "data/20241015_out", paste0("4_ndp_", format(Sys.time(), "%Y_%m_%d"), ".Rdata"))

save.image(rdata_path)
cat("Saved in:",
    rdata_path)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
  password <- Sys.getenv("PASSWORD_SECRET")
} else {
  if (interactive()) {
    utils::savehistory(tempfile())
    Sys.setenv(PASSWORD_SECRET = readLines(paste0(wdpath, "secret.txt"), warn = FALSE))
    utils::loadhistory()
  }
  Sys.setenv(PASSWORD_SECRET = readLines(paste0(wdpath, "secret.txt"), warn = FALSE))
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
save.image(paste0(rdata_path,".enc"))

# Encriptar el archivo en el mismo lugar
httr2::secret_encrypt_file(path = paste0(rdata_path,".enc"), key = "PASSWORD_SECRET")

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Copy renv lock into cons folder\n")

if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
  message("Running on RStudio Server or inside Docker. Folder copy skipped.")

} else {
    
  source_folder <- 
  destination_folder <- paste0(wdpath,"cons/renv")
  
  # Copy the folder recursively
    file.copy(paste0(wdpath,"renv.lock"), paste0(wdpath,"cons/renv.lock"), overwrite = TRUE)
  
  message("Renv lock copy performed.")
}

Renv lock copy performed.

Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
time_after_dedup2<-Sys.time()

paste0("Time in markdown: ");time_after_dedup2-time_before_dedup2
[1] "G:/My Drive/Alvacast/SISTRAT 2023/cons/cons"
[1] "G:/My Drive/Alvacast/SISTRAT 2023//data/20241015_out"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/data/20241015_out"
Saved in: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/4_ndp_2025_06_06.RdataCopy renv lock into cons folder
[1] "Time in markdown: "
Time difference of 9.14165 mins
Back to top